Eval Protocol FrozenLake Example
This example shows how to use Eval Protocol's FrozenLake environment from within rLLM using the generic EvalProtocolWorkflow.
For a conceptual overview of how this integration works and how it generalizes to other benchmarks, see the core-concepts page on Eval Protocol Integration.
Quick Start
Prepare FrozenLake dataset
From the project root:
This script builds and registers the frozen_lake_eval_protocol train/test splits in the rLLM DatasetRegistry.
Run FrozenLake workflow (inference)
Once your Fireworks API credentials are configured, you can run a small batch of FrozenLake tasks through Eval Protocol and rLLM:
This will:
- Load the frozen_lake_eval_protocol test split.
- Use EvalProtocolWorkflow (with env_path="eval_protocol.benchmarks.test_frozen_lake") to run rollouts via Eval Protocol.
- Print per-task rewards/accuracy and save results to logs/frozen_lake_results.json.
Train an RL agent
To train an agent against the same Eval Protocol FrozenLake environment:
This uses EvalProtocolWorkflow inside AgentTrainer (via Hydra configs) to:
- Generate rollouts using Eval Protocol’s rollout processor and MCP server.
- Compute rewards via the Eval Protocol evaluation function.
- Optimize the underlying model with PPO/GRPO.
You can edit train_frozen_lake_flow.sh to customize model path, Fireworks deployment, and training hyperparameters.
Code Reference
Data preparation
Script that builds and registers the FrozenLake Eval Protocol dataset:
import random
from datasets import Dataset
from rllm.data.dataset import DatasetRegistry
def prepare_frozen_lake_data(train_size: int, test_size: int, seed: "int | None" = None):
    """Build and register the FrozenLake Eval Protocol train/test splits.

    Each row holds an id (``run_<index>``), the shared system prompt, the user
    prompt template, and an ``environment_context`` with a distinct map seed
    drawn from 1..1,000,000. The first ``train_size`` rows form the "train"
    split and the remaining ``test_size`` rows form the "test" split. Both are
    registered under the name ``frozen_lake_eval_protocol`` and their sizes
    are printed.

    Args:
        train_size: Number of rows in the train split.
        test_size: Number of rows in the test split.
        seed: Optional seed for drawing the map seeds. When given, the same
            rows are produced on every call; when ``None`` (the default) the
            module-level ``random`` state is used, as before.

    Raises:
        ValueError: Raised by ``random.sample`` when the combined size is
            negative or exceeds the 1,000,000 available map seeds.
    """
    system_prompt = "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid."
    user_prompt_template = "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP."

    def create_row(idx, seed):
        return {
            "id": f"run_{idx}",
            "system_prompt": system_prompt,
            "user_prompt_template": user_prompt_template,
            "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": seed},
        }

    # A private generator keeps a seeded run reproducible without touching the
    # global random state; the unseeded path behaves exactly as it used to.
    rng = random if seed is None else random.Random(seed)

    # Sampling without replacement guarantees every task gets a different map
    # seed, so the train and test splits never share a map seed.
    total_rows = train_size + test_size
    map_seeds = rng.sample(range(1, 1_000_001), total_rows)
    all_rows = [create_row(idx, map_seed) for idx, map_seed in enumerate(map_seeds)]

    train_rows = all_rows[:train_size]
    test_rows = all_rows[train_size:]

    train_dataset = Dataset.from_list(train_rows)
    test_dataset = Dataset.from_list(test_rows)

    DatasetRegistry.register_dataset("frozen_lake_eval_protocol", train_dataset, "train")
    DatasetRegistry.register_dataset("frozen_lake_eval_protocol", test_dataset, "test")

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")
if __name__ == "__main__":
    # Script entry point: register 100 training tasks and 100 test tasks.
    split_sizes = {"train_size": 100, "test_size": 100}
    prepare_frozen_lake_data(**split_sizes)
Workflow runner
Main script for running the FrozenLake Eval Protocol workflow through rLLM:
"""
Run Frozen Lake Workflow with rllm-fw using EvalProtocolWorkflow
This script demonstrates how to execute frozen lake tasks using rllm-fw's
AgentWorkflowEngine with the generic EvalProtocolWorkflow.
"""
import asyncio
import json
import os
from pathlib import Path
from rllm.data.dataset import DatasetRegistry
from rllm.engine.agent_workflow_engine import AgentWorkflowEngine
from rllm.engine.rollout.openai_engine import OpenAIEngine
from rllm.workflows.eval_protocol_workflow import EvalProtocolWorkflow
def evaluate_results(episodes):
    """
    Print an evaluation summary and return the fraction of correct episodes.

    A banner with the task count, the number of correct episodes and the
    accuracy is printed first, followed by one line per episode showing its
    id and its "evaluation_reward" metric (0.0 when the metric is absent).

    Args:
        episodes: List of Episode objects; each is read for `is_correct`,
            `metrics` and `id`.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when `episodes` is empty.
    """
    n_tasks = len(episodes)
    n_correct = len([ep for ep in episodes if ep.is_correct])

    # Guard the division so an empty batch reports 0% instead of failing.
    if n_tasks > 0:
        accuracy = n_correct / n_tasks
    else:
        accuracy = 0.0

    banner = "=" * 60
    print("\n" + banner)
    print("EVALUATION RESULTS")
    print(banner)
    print(f"Total tasks: {n_tasks}")
    print(f"Correct: {n_correct}")
    print(f"Accuracy: {accuracy:.2%}")
    print()

    for ep in episodes:
        if ep.is_correct:
            mark = "✅"
        else:
            mark = "❌"
        score = ep.metrics.get("evaluation_reward", 0.0)
        print(f"{mark} Task {ep.id}: reward={score:.3f}")

    print(banner)
    return accuracy
async def main():
    """Run a small batch of FrozenLake tasks and save the results.

    Loads the registered "test" split of ``frozen_lake_eval_protocol``, runs
    up to ``max_tasks`` of its tasks through ``AgentWorkflowEngine`` with
    ``EvalProtocolWorkflow``, prints each episode's trajectories and the
    evaluation summary, and writes the episodes to
    ``logs/frozen_lake_results.json``.

    Returns:
        The accuracy computed by ``evaluate_results``.

    Raises:
        RuntimeError: If the test split could not be loaded.
        Exception: Any error raised while executing or saving is printed with
            its traceback and re-raised; the engine is shut down either way.
    """
    n_parallel_tasks = 4
    max_tasks = 4

    model_id = "accounts/fireworks/models/kimi-k2-instruct"

    # Load the data before building any engine so a missing dataset fails
    # fast, with nothing to clean up.
    test_dataset = DatasetRegistry.load_dataset("frozen_lake_eval_protocol", "test")
    # NOTE(review): assumes load_dataset returns None for an unregistered
    # split -- confirm against DatasetRegistry.
    if test_dataset is None:
        raise RuntimeError(
            "Dataset 'frozen_lake_eval_protocol' (split 'test') is not registered; "
            "run the FrozenLake data preparation script first."
        )

    # Clamp to the split size so a split smaller than max_tasks does not
    # raise IndexError.
    n_tasks = min(max_tasks, len(test_dataset))
    tasks = [test_dataset[i] for i in range(n_tasks)]

    # Create dummy rollout_engine (required by Workflow base class but not used)
    rollout_engine = OpenAIEngine(
        model=model_id,
        base_url="https://api.fireworks.ai/inference/v1",
        api_key=os.getenv("FIREWORKS_API_KEY"),
    )

    engine = AgentWorkflowEngine(
        workflow_cls=EvalProtocolWorkflow,
        workflow_args={
            "env_path": "eval_protocol.benchmarks.test_frozen_lake",
            "lite_llm_prefix": "fireworks_ai/",
            "steps": 30,
            "temperature": 1.0,
            "max_tokens": 16384,
        },
        rollout_engine=rollout_engine,
        n_parallel_tasks=n_parallel_tasks,
        retry_limit=1,
    )

    print("Starting frozen lake workflow execution...")
    print(f"Model: {model_id}")
    print(f"Parallel tasks: {n_parallel_tasks}")
    print()

    try:
        episodes = await engine.execute_tasks(tasks)

        for episode in episodes:
            print(episode.trajectories)

        accuracy = evaluate_results(episodes)

        output_dir = Path("logs")
        output_dir.mkdir(exist_ok=True)
        output_file = output_dir / "frozen_lake_results.json"

        # Explicit encoding keeps the output independent of the platform's
        # default locale.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump([episode.to_dict() for episode in episodes], f, indent=2)

        print(f"\n✅ Results saved to {output_file}")

        return accuracy

    except Exception as e:
        print(f"❌ Error during execution: {e}")
        import traceback

        traceback.print_exc()
        raise

    finally:
        # Always release the engine, whether the run succeeded or failed.
        engine.shutdown()
if __name__ == "__main__":
    # Set before any work starts; presumably this disables tokenizer-level
    # parallelism in libraries that read the variable -- confirm.
    os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

    # Drive the async workflow to completion and report its accuracy.
    run = main()
    accuracy = asyncio.run(run)
    print(f"\n🎯 Final Accuracy: {accuracy:.2%}")
Training script
Agent training implementation using EvalProtocolWorkflow and AgentTrainer:
import hydra
from rllm.data.dataset import DatasetRegistry
from rllm.trainer.agent_trainer import AgentTrainer
from rllm.workflows.eval_protocol_workflow import EvalProtocolWorkflow
@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
def main(config):
    """Train an agent on FrozenLake using EvalProtocolWorkflow.

    Loads the registered "train" and "test" splits of
    ``frozen_lake_eval_protocol`` and hands them, together with the workflow
    settings and the Hydra ``config``, to ``AgentTrainer`` using the
    "fireworks" backend, then starts training. The test split is used as the
    validation dataset.

    Args:
        config: Configuration object supplied by the ``hydra.main`` decorator
            (config name ``agent_ppo_trainer``).
    """
    dataset_name = "frozen_lake_eval_protocol"
    # Load order matches the split order: train first, then test.
    splits = {
        split: DatasetRegistry.load_dataset(dataset_name, split)
        for split in ("train", "test")
    }

    # Settings forwarded to every EvalProtocolWorkflow instance.
    rollout_settings = dict(
        env_path="eval_protocol.benchmarks.test_frozen_lake",
        lite_llm_prefix="fireworks_ai/",
        steps=30,
        temperature=1.0,
        max_tokens=32768,
    )

    AgentTrainer(
        workflow_class=EvalProtocolWorkflow,
        workflow_args=rollout_settings,
        config=config,
        train_dataset=splits["train"],
        val_dataset=splits["test"],
        backend="fireworks",
    ).train()


if __name__ == "__main__":
    main()