Eval Protocol FrozenLake Example

This example shows how to use Eval Protocol's FrozenLake environment from within rLLM using the generic EvalProtocolWorkflow.

For a conceptual overview of how this integration works and how it generalizes to other benchmarks, see the core-concepts page on Eval Protocol Integration.

Quick Start

Prepare FrozenLake dataset

From the project root:

cd examples/eval_protocol
python prepare_frozen_lake_data.py

This script builds and registers the frozen_lake_eval_protocol train/test splits in the rLLM DatasetRegistry.

Run FrozenLake workflow (inference)

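Once your Fireworks API credentials are configured (run_frozen_lake_flow.py reads the key from the FIREWORKS_API_KEY environment variable) and the dataset from the previous step is registered, you can run a small batch of FrozenLake tasks (the first four rows of the test split by default) through Eval Protocol and rLLM: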

python run_frozen_lake_flow.py

This will:

- Load the frozen_lake_eval_protocol test split.
- Use EvalProtocolWorkflow (with env_path="eval_protocol.benchmarks.test_frozen_lake") to run rollouts via Eval Protocol.
- Print per-task rewards/accuracy and save results to logs/frozen_lake_results.json.

Train an RL agent

To train an agent against the same Eval Protocol FrozenLake environment:

bash train_frozen_lake_flow.sh

This uses EvalProtocolWorkflow inside AgentTrainer (via Hydra configs) to:

- Generate rollouts using Eval Protocol’s rollout processor and MCP server.
- Compute rewards via the Eval Protocol evaluation function.
- Optimize the underlying model with PPO/GRPO.

You can edit train_frozen_lake_flow.sh to customize the model path, the Fireworks deployment, and the training hyperparameters.

Code Reference

Data preparation

Script that builds and registers the FrozenLake Eval Protocol dataset:

examples/eval_protocol/prepare_frozen_lake_data.py

import random

from datasets import Dataset

from rllm.data.dataset import DatasetRegistry


def prepare_frozen_lake_data(train_size: int, test_size: int, *, seed=None):
    """Build and register the FrozenLake Eval Protocol train/test splits.

    Every row holds an ``id`` (``run_<index>``, numbered across both splits),
    the shared system prompt, the user prompt template and an
    ``environment_context`` whose ``seed`` is drawn without replacement from
    ``1..1_000_000``. The first ``train_size`` rows are registered as the
    ``"train"`` split of ``frozen_lake_eval_protocol`` and the remaining
    ``test_size`` rows as its ``"test"`` split. The sizes of both splits are
    printed at the end.

    Args:
        train_size: Number of rows in the train split.
        test_size: Number of rows in the test split.
        seed: Optional seed for drawing the per-row environment seeds. When
            given, repeated runs produce identical splits. When ``None``
            (the default) the module-level ``random`` generator is used, so
            existing callers see unchanged behavior.

    Raises:
        ValueError: Propagated from ``random.sample`` when
            ``train_size + test_size`` is negative or larger than 1,000,000.
    """
    system_prompt = "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure.  Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid."
    user_prompt_template = "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP."

    # A dedicated generator keeps a caller-supplied seed from disturbing the
    # global random state; without a seed the global generator is used as before.
    rng = random if seed is None else random.Random(seed)
    env_seeds = rng.sample(range(1, 1_000_001), train_size + test_size)

    all_rows = [
        {
            "id": f"run_{idx}",
            "system_prompt": system_prompt,
            "user_prompt_template": user_prompt_template,
            "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": env_seed},
        }
        for idx, env_seed in enumerate(env_seeds)
    ]
    train_rows = all_rows[:train_size]
    test_rows = all_rows[train_size:]

    train_dataset = Dataset.from_list(train_rows)
    test_dataset = Dataset.from_list(test_rows)

    DatasetRegistry.register_dataset("frozen_lake_eval_protocol", train_dataset, "train")
    DatasetRegistry.register_dataset("frozen_lake_eval_protocol", test_dataset, "test")

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")


if __name__ == "__main__":
    # Script entry point: build and register 100 train rows and 100 test rows.
    split_sizes = {"train_size": 100, "test_size": 100}
    prepare_frozen_lake_data(**split_sizes)

Workflow runner

Main script for running the FrozenLake Eval Protocol workflow through rLLM:

examples/eval_protocol/run_frozen_lake_flow.py

"""
Run Frozen Lake Workflow with rllm-fw using EvalProtocolWorkflow

This script demonstrates how to execute frozen lake tasks using rllm-fw's
AgentWorkflowEngine with the generic EvalProtocolWorkflow.
"""

import asyncio
import json
import os
from pathlib import Path

from rllm.data.dataset import DatasetRegistry
from rllm.engine.agent_workflow_engine import AgentWorkflowEngine
from rllm.engine.rollout.openai_engine import OpenAIEngine
from rllm.workflows.eval_protocol_workflow import EvalProtocolWorkflow


def evaluate_results(episodes):
    """
    Print an accuracy report for the given episodes and return the accuracy.

    Args:
        episodes: List of Episode objects. Each one is read for its
            ``is_correct`` flag, its ``metrics`` mapping (key
            ``"evaluation_reward"``, defaulting to 0.0) and its ``id``.

    Returns:
        The fraction of episodes whose ``is_correct`` is truthy, or 0.0 when
        the list is empty.
    """
    separator = "=" * 60
    num_tasks = len(episodes)
    num_correct = len([ep for ep in episodes if ep.is_correct])

    # Guard the division so an empty run reports 0% instead of raising.
    if num_tasks > 0:
        accuracy = num_correct / num_tasks
    else:
        accuracy = 0.0

    # Summary header.
    print("\n" + separator)
    print("EVALUATION RESULTS")
    print(separator)
    print(f"Total tasks: {num_tasks}")
    print(f"Correct: {num_correct}")
    print(f"Accuracy: {accuracy:.2%}")
    print()

    # One line per episode with its pass/fail marker and reward.
    for ep in episodes:
        marker = "✅" if ep.is_correct else "❌"
        reward = ep.metrics.get("evaluation_reward", 0.0)
        print(f"{marker} Task {ep.id}: reward={reward:.3f}")

    print(separator)

    return accuracy


async def main():
    """Run a small batch of FrozenLake tasks and save the resulting episodes.

    Loads up to ``max_tasks`` rows from the ``frozen_lake_eval_protocol`` test
    split, executes them through ``AgentWorkflowEngine`` with
    ``EvalProtocolWorkflow``, prints each episode's trajectories and the
    evaluation summary, and writes the episodes to
    ``logs/frozen_lake_results.json``.

    Returns:
        The accuracy returned by ``evaluate_results``.

    Raises:
        RuntimeError: If the test split could not be loaded.
        Exception: Any error raised while executing tasks or saving results
            is printed with its traceback and re-raised; the engine is shut
            down in either case.
    """

    n_parallel_tasks = 4
    max_tasks = 4
    model_id = "accounts/fireworks/models/kimi-k2-instruct"

    # Load the tasks before any engine exists, so a missing or short dataset
    # fails fast instead of leaving behind an engine that is never shut down.
    test_dataset = DatasetRegistry.load_dataset("frozen_lake_eval_protocol", "test")
    if test_dataset is None:
        # NOTE(review): assumes load_dataset returns None for an unregistered
        # dataset — confirm against DatasetRegistry.
        raise RuntimeError("Dataset 'frozen_lake_eval_protocol' (test split) not found; run prepare_frozen_lake_data.py first.")
    # Clamp so a test split smaller than max_tasks does not raise IndexError.
    n_tasks = min(max_tasks, len(test_dataset))
    tasks = [test_dataset[i] for i in range(n_tasks)]

    # Create dummy rollout_engine (required by Workflow base class but not used)
    rollout_engine = OpenAIEngine(
        model=model_id,
        base_url="https://api.fireworks.ai/inference/v1",
        api_key=os.getenv("FIREWORKS_API_KEY"),
    )

    engine = AgentWorkflowEngine(
        workflow_cls=EvalProtocolWorkflow,
        workflow_args={
            "env_path": "eval_protocol.benchmarks.test_frozen_lake",
            "lite_llm_prefix": "fireworks_ai/",
            "steps": 30,
            "temperature": 1.0,
            "max_tokens": 16384,
        },
        rollout_engine=rollout_engine,
        n_parallel_tasks=n_parallel_tasks,
        retry_limit=1,
    )

    print("Starting frozen lake workflow execution...")
    print(f"Model: {model_id}")
    print(f"Parallel tasks: {n_parallel_tasks}")
    print()

    try:
        episodes = await engine.execute_tasks(tasks)
        for episode in episodes:
            print(episode.trajectories)
        accuracy = evaluate_results(episodes)

        output_dir = Path("logs")
        output_dir.mkdir(exist_ok=True)
        output_file = output_dir / "frozen_lake_results.json"

        # Explicit encoding keeps the output independent of the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump([episode.to_dict() for episode in episodes], f, indent=2)

        print(f"\n✅ Results saved to {output_file}")

        return accuracy

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"❌ Error during execution: {e}")
        import traceback

        traceback.print_exc()
        raise
    finally:
        # Runs on success and on failure so the engine is always released.
        engine.shutdown()


if __name__ == "__main__":
    # Set the variable before the event loop starts so it is in place for
    # everything main() runs.
    # NOTE(review): presumably silences tokenizer parallelism warnings — confirm.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Drive the async entry point to completion and report its accuracy.
    final_accuracy = asyncio.run(main())
    print(f"\n🎯 Final Accuracy: {final_accuracy:.2%}")

Training script

Agent training implementation using EvalProtocolWorkflow and AgentTrainer:

examples/eval_protocol/train_frozen_lake_flow.py

import hydra

from rllm.data.dataset import DatasetRegistry
from rllm.trainer.agent_trainer import AgentTrainer
from rllm.workflows.eval_protocol_workflow import EvalProtocolWorkflow


@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
def main(config):
    """Train an agent on the FrozenLake Eval Protocol environment.

    Loads the train and test splits of ``frozen_lake_eval_protocol``, builds
    an ``AgentTrainer`` around ``EvalProtocolWorkflow`` with the "fireworks"
    backend, and starts training. The test split is passed as the validation
    dataset.

    Args:
        config: Configuration object passed in through the ``hydra.main``
            decorator (config name ``agent_ppo_trainer``).
    """
    dataset_name = "frozen_lake_eval_protocol"
    train_split = DatasetRegistry.load_dataset(dataset_name, "train")
    eval_split = DatasetRegistry.load_dataset(dataset_name, "test")

    # Arguments forwarded to the workflow via the trainer.
    workflow_settings = {
        "env_path": "eval_protocol.benchmarks.test_frozen_lake",
        "lite_llm_prefix": "fireworks_ai/",
        "steps": 30,
        "temperature": 1.0,
        "max_tokens": 32768,
    }

    agent_trainer = AgentTrainer(
        workflow_class=EvalProtocolWorkflow,
        workflow_args=workflow_settings,
        config=config,
        train_dataset=train_split,
        val_dataset=eval_split,
        backend="fireworks",
    )
    agent_trainer.train()


if __name__ == "__main__":
    # Script entry point; main() is called without arguments because the
    # hydra.main decorator on it provides the config.
    main()