Base Environment

Core environment interface and base functionality that all rLLM environments inherit from.

Base Environment

rllm.environments.base.base_env

BaseEnv

Bases: ABC

Source code in rllm/environments/base/base_env.py

class BaseEnv(ABC):
    """Abstract interface shared by rLLM environments.

    Subclasses must implement ``reset``, ``step`` and ``from_dict``.
    ``close`` and ``is_multithread_safe`` ship with default implementations,
    and ``idx`` is a writable property for tagging an instance.
    """

    @property
    def idx(self) -> Any:
        """Identifier of this environment, e.g. its position within a batch.

        Returns:
            The value last assigned through the setter, or None when no
            value has been assigned yet.
        """
        try:
            return self._idx
        except AttributeError:
            # Nothing has been stored on this instance so far.
            return None

    @idx.setter
    def idx(self, value: Any):
        """Assign the identifier of this environment.

        The identifier can be attached after construction, for instance to
        record where the instance sits inside a batch.

        Example:
            env = MyEnvSubclass()  # any concrete BaseEnv subclass
            env.idx = 5            # tag the instance from the outside

        Args:
            value: The index or identifier to store on this environment.
        """
        self._idx = value

    @abstractmethod
    def reset(self) -> tuple[dict, dict]:
        """Gym-style reset: bring the environment back to an initial state.

        Returns:
            A tuple that typically holds the initial observation followed by
            auxiliary info.
        """

    @abstractmethod
    def step(self, action: Any) -> tuple[Any, float, bool, dict]:
        """Gym-style step: advance the environment by one time step.

        Args:
            action: The action supplied by the agent.

        Returns:
            A tuple of (observation, reward, done, info).
        """

    def close(self):
        """Gym-style close hook for cleanup; the base version does nothing."""

    @staticmethod
    @abstractmethod
    def from_dict(info: dict) -> "BaseEnv":
        """Build an environment instance from a dictionary.

        Concrete subclasses implement this factory to perform their own
        environment-specific initialization from serialized data.

        Args:
            info: Dictionary holding whatever is needed to initialize the
                environment.

        Returns:
            An instance of the concrete BaseEnv subclass.

        Raises:
            NotImplementedError: If the subclass does not implement this method.
        """
        # The abstract base has nothing to construct; subclasses provide it.
        raise NotImplementedError("Subclasses must implement the 'from_dict' static method.")

    @staticmethod
    def is_multithread_safe() -> bool:
        """Return the multithread-safety flag; the base implementation returns True."""
        return True

idx `property` `writable`

idx: Any

The index or identifier of the environment, often used within a batch.

Returns:

Type	Description
`Any`	The assigned index or identifier, or None if not set.

reset `abstractmethod`

reset() -> tuple[dict, dict]

Standard Gym reset method. Resets the environment to an initial state.

Returns:

Type	Description
`tuple[dict, dict]`	A tuple typically containing the initial observation and auxiliary info.

Source code in rllm/environments/base/base_env.py

@abstractmethod
def reset(self) -> tuple[dict, dict]:
    """Gym-style reset: bring the environment back to an initial state.

    Returns:
        A tuple that typically holds the initial observation followed by
        auxiliary info.
    """

step `abstractmethod`

step(action: Any) -> tuple[Any, float, bool, dict]

Standard Gym step method. Executes one time step within the environment.

Parameters:

Name	Type	Description	Default
`action`	`Any`	An action provided by the agent.	required

Returns:

Type	Description
`tuple[Any, float, bool, dict]`	A tuple containing (observation, reward, done, info).

Source code in rllm/environments/base/base_env.py

@abstractmethod
def step(self, action: Any) -> tuple[Any, float, bool, dict]:
    """Gym-style step: advance the environment by one time step.

    Args:
        action: The action supplied by the agent.

    Returns:
        A tuple of (observation, reward, done, info).
    """

close

close()

Standard Gym close method. Performs any necessary cleanup.

Source code in rllm/environments/base/base_env.py

def close(self):
    """Gym-style close hook for cleanup; this default version does nothing."""
    return None

from_dict `abstractmethod` `staticmethod`

from_dict(info: dict) -> BaseEnv

Creates an environment instance from a dictionary.

This method should be implemented by concrete subclasses to handle environment-specific initialization from serialized data.

Parameters:

Name	Type	Description	Default
`info`	`dict`	A dictionary containing the necessary information to initialize the environment.	required

Returns:

Type	Description
`BaseEnv`	An instance of the specific BaseEnv subclass.

Raises:

Type	Description
`NotImplementedError`	If the subclass does not implement this method.

Source code in rllm/environments/base/base_env.py

@staticmethod
@abstractmethod
def from_dict(info: dict) -> "BaseEnv":
    """Build an environment instance from a dictionary.

    Concrete subclasses implement this factory to perform their own
    environment-specific initialization from serialized data.

    Args:
        info: Dictionary holding whatever is needed to initialize the environment.

    Returns:
        An instance of the concrete BaseEnv subclass.

    Raises:
        NotImplementedError: If the subclass does not implement this method.
    """
    # The abstract base has nothing to construct; subclasses provide it.
    message = "Subclasses must implement the 'from_dict' static method."
    raise NotImplementedError(message)

Single Turn Environment

rllm.environments.base.single_turn_env

SingleTurnEnvironment

Bases: MultiTurnEnvironment

A simple environment for single-turn interactions with LLMs. This is a special case of MultiTurnEnvironment where max_turns=1. The environment provides a question/prompt and evaluates the response using a custom reward function.

Source code in rllm/environments/base/single_turn_env.py

class SingleTurnEnvironment(MultiTurnEnvironment):
    """
    A simple environment for single-turn interactions with LLMs.
    This is a special case of MultiTurnEnvironment where max_turns=1.
    The environment provides a question/prompt and evaluates the response using a custom reward function.
    """

    def __init__(self, task: dict | None = None, reward_fn: RewardFunction | None = None, **kwargs):
        """
        Initialize the single turn environment.

        Args:
            task: Dictionary containing the task information, including at least a "question" field
            reward_fn: Callable used to score a response. It is invoked as
                ``reward_fn(task_info=task, action=action)`` and its result must expose a
                ``reward`` attribute. When None, a warning is emitted and ``zero_reward`` is used.
            **kwargs: Extra keyword arguments forwarded to ``MultiTurnEnvironment.__init__``.
        """
        super().__init__(task=task, max_turns=1, **kwargs)
        if reward_fn is None:
            warnings.warn("No reward function provided, using zero reward", stacklevel=2)
        self.reward_fn = reward_fn or zero_reward

    def get_reward_and_next_obs(self, task: dict, action: Any) -> tuple[float, dict]:
        """
        Compute the reward based on the task and action.

        Args:
            task: The task dictionary containing relevant information
            action: The action taken by the agent

        Returns:
            Tuple of (reward: float, next_observation: Dict). The next observation is always
            an empty dict because the interaction ends after a single turn.
        """
        reward_output = self.reward_fn(task_info=task, action=action)

        return reward_output.reward, {}

    @staticmethod
    def from_dict(env_args: dict) -> "SingleTurnEnvironment":
        """
        Build a SingleTurnEnvironment from a dictionary of arguments.

        Args:
            env_args: Dictionary that may carry a "reward_fn" entry and a "task" entry.
                When "task" is absent, the remaining entries (everything except
                "reward_fn") are used as the task. The dictionary is not modified.

        Returns:
            A new SingleTurnEnvironment.
        """
        # Work on a shallow copy: popping from the caller's dict would strip
        # "reward_fn" from arguments that may be reused to build further environments.
        args = dict(env_args)
        reward_fn = args.pop("reward_fn", None)
        task = args.get("task", args)
        return SingleTurnEnvironment(task=task, reward_fn=reward_fn)

__init__

__init__(task: dict | None = None, reward_fn: RewardFunction | None = None, **kwargs)

Initialize the single turn environment.

Parameters:

Name	Type	Description	Default
`task`	`dict \| None`	Dictionary containing the task information, including at least a "question" field	`None`

Source code in rllm/environments/base/single_turn_env.py

def __init__(self, task: dict | None = None, reward_fn: RewardFunction | None = None, **kwargs):
    """
    Set up a one-turn environment around a task and a reward function.

    Args:
        task: Dictionary containing the task information, including at least a "question" field
        reward_fn: Callable used to score a response; when None, a warning is emitted and
            ``zero_reward`` is used instead.
        **kwargs: Extra keyword arguments passed through to the parent initializer.
    """
    # The parent is always configured for exactly one turn.
    super().__init__(task=task, max_turns=1, **kwargs)
    if reward_fn is None:
        warnings.warn("No reward function provided, using zero reward", stacklevel=2)
    self.reward_fn = reward_fn if reward_fn else zero_reward

get_reward_and_next_obs

get_reward_and_next_obs(task: dict, action: Any) -> tuple[float, dict]

Compute the reward based on the task and action.

Parameters:

Name	Type	Description	Default
`task`	`dict`	The task dictionary containing relevant information	required
`action`	`Any`	The action taken by the agent	required

Returns:

Type	Description
`tuple[float, dict]`	Tuple of (reward: float, next_observation: Dict)

Source code in rllm/environments/base/single_turn_env.py

def get_reward_and_next_obs(self, task: dict, action: Any) -> tuple[float, dict]:
    """
    Score the agent's action against the task with the configured reward function.

    Args:
        task: The task dictionary containing relevant information
        action: The action taken by the agent

    Returns:
        Tuple of (reward: float, next_observation: Dict); the next observation
        is always an empty dict.
    """
    # The reward function is called with keyword arguments and its result
    # carries the scalar reward in its ``reward`` attribute.
    return self.reward_fn(task_info=task, action=action).reward, {}

Multi Turn Environment

rllm.environments.base.multi_turn_env

MultiTurnEnvironment

Bases: BaseEnv, ABC

An environment for multi-turn interactions with LLMs. The environment provides a series of questions/prompts and evaluates responses using a custom reward function. The interaction terminates after reaching the maximum number of turns.

Source code in rllm/environments/base/multi_turn_env.py

class MultiTurnEnvironment(BaseEnv, ABC):
    """
    An environment for multi-turn interactions with LLMs.
    The environment provides a series of questions/prompts and evaluates responses using a custom reward function.
    The interaction terminates after reaching the maximum number of turns.
    """

    def __init__(self, task: dict | None = None, max_turns: int = 3, **kwargs):
        """
        Initialize the multi-turn environment.

        Args:
            task: Dictionary containing the task information, including at least a "questions" field
                  with a list of questions for each turn
            max_turns: Maximum number of turns before terminating the interaction
            **kwargs: Accepted for signature compatibility; not used by this initializer.
        """
        super().__init__()
        self.task = task
        self.max_turns = max_turns
        self.current_turn = 0
        self.done = False
        self.history = []

    def reset(self, task: dict | None = None) -> tuple[dict | None, dict]:
        """
        Reset the turn counter, the done flag and the action history.

        Args:
            task: Optional replacement task. When None, the task currently stored
                on the environment is kept.

        Returns:
            Tuple of (task, info), where info is an empty dict.
        """
        # Use the provided task if available, otherwise use the default task
        if task is not None:
            self.task = task

        self.done = False
        self.current_turn = 0
        self.history = []

        return self.task, {}

    def step(self, action: Any) -> tuple[Any, float, bool, dict]:
        """
        Take a step in the environment based on the action.

        Args:
            action: Response string from the LLM

        Returns:
            A 4-tuple (next_observation, reward, done, info). ``info`` is the task
            dictionary. On the turn that reaches ``max_turns``, ``done`` is True and
            the next observation is an empty dict.

        Raises:
            AssertionError: If no task has been set.
        """
        # Validate before touching any state so a failed call leaves the history
        # untouched. An explicit raise (rather than an `assert` statement) keeps
        # the check active under `python -O`; the exception type is unchanged.
        if self.task is None:
            raise AssertionError("Task is not set")

        # Store the action in history
        self.history.append(action)

        # Calculate reward for the current turn using the abstract method
        reward, next_obs = self.get_reward_and_next_obs(self.task, action)

        # Increment turn counter
        self.current_turn += 1

        # Check if we've reached the maximum number of turns
        if self.current_turn >= self.max_turns:
            self.done = True
            return {}, reward, self.done, self.task

        return next_obs, reward, self.done, self.task

    @abstractmethod
    def get_reward_and_next_obs(self, task: dict, action: Any) -> tuple[float, dict]:
        """
        Abstract method to compute the reward based on the task and action.

        Args:
            task: The task dictionary containing relevant information
            action: The action taken by the agent

        Returns:
            Tuple of (reward: float, next_observation: Dict). ``step`` returns the
            second element as the next observation on every turn except the last.
        """
        pass

    @staticmethod
    def from_dict(env_args: dict) -> "MultiTurnEnvironment":
        """
        Always raises: this class is abstract, so concrete subclasses supply the factory.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("MultiTurnEnvironment is abstract and cannot be instantiated directly. Use a concrete subclass.")

__init__

__init__(task: dict | None = None, max_turns: int = 3, **kwargs)

Initialize the multi-turn environment.

Parameters:

Name	Type	Description	Default
`task`	`dict \| None`	Dictionary containing the task information, including at least a "questions" field with a list of questions for each turn	`None`
`max_turns`	`int`	Maximum number of turns before terminating the interaction	`3`

Source code in rllm/environments/base/multi_turn_env.py

def __init__(self, task: dict | None = None, max_turns: int = 3, **kwargs):
    """
    Set up the multi-turn environment with its task and turn budget.

    Args:
        task: Dictionary containing the task information, including at least a "questions" field
              with a list of questions for each turn
        max_turns: Maximum number of turns before terminating the interaction
        **kwargs: Accepted but not used by this initializer.
    """
    super().__init__()

    # Per-episode bookkeeping starts out empty.
    self.history = []
    self.done = False
    self.current_turn = 0

    # Configuration supplied by the caller.
    self.max_turns = max_turns
    self.task = task

step

step(action)

Take a step in the environment based on the action.

Parameters:

Name	Type	Description	Default
`action`		Response string from the LLM	required

Returns:

Type	Description
	A 4-tuple of (next_observation, reward, done, info)

Source code in rllm/environments/base/multi_turn_env.py

def step(self, action: Any) -> tuple[Any, float, bool, dict]:
    """
    Take a step in the environment based on the action.

    Args:
        action: Response string from the LLM

    Returns:
        A 4-tuple (next_observation, reward, done, info). ``info`` is the task
        dictionary. On the turn that reaches ``max_turns``, ``done`` is True and
        the next observation is an empty dict.

    Raises:
        AssertionError: If no task has been set.
    """
    # Validate before touching any state so a failed call leaves the history
    # untouched. An explicit raise (rather than an `assert` statement) keeps
    # the check active under `python -O`; the exception type is unchanged.
    if self.task is None:
        raise AssertionError("Task is not set")

    # Store the action in history
    self.history.append(action)

    # Calculate reward for the current turn using the abstract method
    reward, next_obs = self.get_reward_and_next_obs(self.task, action)

    # Increment turn counter
    self.current_turn += 1

    # Check if we've reached the maximum number of turns
    if self.current_turn >= self.max_turns:
        self.done = True
        return {}, reward, self.done, self.task

    return next_obs, reward, self.done, self.task

get_reward_and_next_obs `abstractmethod`

get_reward_and_next_obs(task: dict, action: Any) -> tuple[float, dict]

Abstract method to compute the reward based on the task and action.

Parameters:

Name	Type	Description	Default
`task`	`dict`	The task dictionary containing relevant information	required
`action`	`Any`	The action taken by the agent	required

Returns:

Type	Description
`tuple[float, dict]`	Tuple of (reward: float, metadata: Dict)

Source code in rllm/environments/base/multi_turn_env.py

@abstractmethod
def get_reward_and_next_obs(self, task: dict, action: Any) -> tuple[float, dict]:
    """
    Hook that concrete environments implement to score an action for a task.

    Args:
        task: The task dictionary containing relevant information
        action: The action taken by the agent

    Returns:
        Tuple of (reward: float, metadata: Dict)
    """

Base Environment