Web Tools

Web-based tools for search, scraping, and content extraction.

rllm.tools.web_tools

FirecrawlTool

Bases: Tool

A tool for extracting data from websites using the FireCrawl service.

Source code in rllm/tools/web_tools/firecrawl_tool.py

class FirecrawlTool(Tool):
    """A tool for extracting data from websites using the FireCrawl service.

    ``forward`` starts a batch-scrape job for a single URL, polls it until it
    reports completion or ``timeout`` seconds have elapsed, and returns the
    scraped markdown keyed by page URL inside a ``ToolOutput``.
    """

    def __init__(self, timeout: int = TIMEOUT, api_key: str = FIRECRAWL_API_KEY, api_url: str | None = None):
        """
        Initialize the Firecrawl tool.

        Args:
            timeout (int): Maximum time in seconds to wait for scraping results.
            api_key (str): API key for FireCrawl service.
            api_url (str, optional): Custom API URL endpoint. When given, the
                client is built from the URL alone and ``api_key`` is not
                passed to it.

        Raises:
            ImportError: If ``FirecrawlApp`` is ``None`` (SDK not installed).
        """
        if FirecrawlApp is None:
            raise ImportError("Firecrawl is not installed. Please install it using 'pip install firecrawl'.")
        self.timeout = timeout
        self.api_key = api_key
        self.api_url = api_url
        # The client is built before the base-class initializer runs.
        self._init_app()
        super().__init__(name="firecrawl", description="FireCrawl is a tool that scrapes a url link and returns content as a markdown document along with any links.")

    def _init_app(self):
        """Initialize the FirecrawlApp instance with appropriate configuration."""
        assert self.api_key is not None or self.api_url is not None, "Either api_key or api_url must be provided."
        if self.api_url is None:
            self.app: Any = FirecrawlApp(api_key=self.api_key)
        else:
            # A custom endpoint takes precedence; the API key is not forwarded.
            self.app = FirecrawlApp(api_url=self.api_url)

    def _start_firecrawl_job(self, url):
        """
        Start a job with firecrawl async API and return job ID.

        Args:
            url (str): The URL to scrape.

        Returns:
            dict: Response from the FireCrawl API containing job information.
        """
        # crawl has many scrape options, potentially can let the agent choose
        # Firecrawl SDK expects options as positional dict, not a 'params' kwarg
        return self.app.async_batch_scrape_urls([url], {"formats": ["markdown", "links"], "onlyMainContent": True})

    @property
    def json(self):
        """Return the tool's information in a standardized format for tool registration."""
        return {"type": "function", "function": {"name": self.name, "description": self.description, "parameters": {"type": "object", "properties": {"url": {"type": "string", "description": "Web URL to scrape content from."}}, "required": ["url"]}}}

    def forward(self, url: str) -> ToolOutput:
        """
        Scrape ``url`` with Firecrawl and block until the job finishes.

        A batch-scrape job is started, then its status is polled roughly once
        per second until it reports completion or ``self.timeout`` seconds
        have passed. Every failure is reported through the returned
        ``ToolOutput`` rather than raised.

        Args:
            url (str): The URL to scrape.

        Returns:
            ToolOutput: An object containing either the scraped content
                (a dict mapping each page's metadata URL to its markdown) or
                an error message.
        """
        try:
            job = self._start_firecrawl_job(url)
        except Exception as e:
            return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl job could not start: {e}")

        # Polling and result extraction talk to the remote service and index
        # into its response, so they are guarded just like the job start.
        try:
            if not job["success"]:
                return ToolOutput(name=self.name or "firecrawl", error="Firecrawl job failed to start")

            job_id = job["id"]
            deadline = time.monotonic() + self.timeout
            while True:
                status = self.app.check_batch_scrape_status(job_id)
                if status["completed"]:
                    break
                # Check the deadline before sleeping so the final second is
                # never spent waiting for a poll that will not happen.
                if time.monotonic() > deadline:
                    return ToolOutput(name=self.name or "firecrawl", error="Firecrawl request timed out")
                time.sleep(1)

            if status["success"]:
                results = {page["metadata"]["url"]: page["markdown"] for page in status["data"]}
                return ToolOutput(name=self.name or "firecrawl", output=results)
            return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl request errored: {status.get('error', 'unknown error')}")
        except Exception as e:
            return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl request errored: {type(e).__name__} - {e}")

    async def async_forward(self, url: str) -> ToolOutput:
        """
        Asynchronous version of the forward method.

        The blocking ``forward`` call (which sleeps between polls) is run in
        a worker thread so the event loop stays responsive.

        Args:
            url (str): The URL to scrape.

        Returns:
            ToolOutput: An object containing either the scraped content or an error message.
        """
        # Imported locally so this block does not rely on a module-level import.
        import asyncio

        return await asyncio.to_thread(self.forward, url=url)

json `property`

json

Return the tool's information in a standardized format for tool registration.

__init__

__init__(timeout: int = TIMEOUT, api_key: str = FIRECRAWL_API_KEY, api_url: str | None = None)

Initialize the Firecrawl tool.

Parameters:

Name	Type	Description	Default
`timeout`	`int`	Maximum time in seconds to wait for scraping results.	`TIMEOUT`
`api_key`	`str`	API key for FireCrawl service.	`FIRECRAWL_API_KEY`
`api_url`	`str`	Custom API URL endpoint.	`None`

Source code in rllm/tools/web_tools/firecrawl_tool.py

def __init__(self, timeout: int = TIMEOUT, api_key: str = FIRECRAWL_API_KEY, api_url: str | None = None):
    """
    Set up the Firecrawl tool and build its SDK client.

    Args:
        timeout (int): Maximum time in seconds to wait for scraping results.
        api_key (str): API key for FireCrawl service.
        api_url (str, optional): Custom API URL endpoint.

    Raises:
        ImportError: If ``FirecrawlApp`` is ``None`` (SDK not installed).
    """
    sdk_missing = FirecrawlApp is None
    if sdk_missing:
        raise ImportError("Firecrawl is not installed. Please install it using 'pip install firecrawl'.")

    # Store the settings first, then build the client via _init_app().
    self.timeout, self.api_key, self.api_url = timeout, api_key, api_url
    self._init_app()

    tool_description = "FireCrawl is a tool that scrapes a url link and returns content as a markdown document along with any links."
    super().__init__(name="firecrawl", description=tool_description)

forward

forward(url: str) -> ToolOutput

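Scrape a URL with Firecrawl: start a batch-scrape job, then block, polling its status about once per second, until the job completes or the timeout elapses.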

Parameters:

Name	Type	Description	Default
`url`	`str`	The URL to scrape.	required

Returns:

Name	Type	Description
`ToolOutput`	`ToolOutput`	An object containing either the scraped content or an error message.

Source code in rllm/tools/web_tools/firecrawl_tool.py

def forward(self, url: str) -> ToolOutput:
    """
    Scrape ``url`` with Firecrawl and block until the job finishes.

    A batch-scrape job is started, then its status is polled roughly once
    per second until it reports completion or ``self.timeout`` seconds have
    passed. Every failure is reported through the returned ``ToolOutput``
    rather than raised.

    Args:
        url (str): The URL to scrape.

    Returns:
        ToolOutput: An object containing either the scraped content (a dict
            mapping each page's metadata URL to its markdown) or an error
            message.
    """
    try:
        job = self._start_firecrawl_job(url)
    except Exception as e:
        return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl job could not start: {e}")

    # Polling and result extraction talk to the remote service and index
    # into its response, so they are guarded just like the job start.
    try:
        if not job["success"]:
            return ToolOutput(name=self.name or "firecrawl", error="Firecrawl job failed to start")

        job_id = job["id"]
        deadline = time.monotonic() + self.timeout
        while True:
            status = self.app.check_batch_scrape_status(job_id)
            if status["completed"]:
                break
            # Check the deadline before sleeping so the final second is never
            # spent waiting for a poll that will not happen.
            if time.monotonic() > deadline:
                return ToolOutput(name=self.name or "firecrawl", error="Firecrawl request timed out")
            time.sleep(1)

        if status["success"]:
            results = {page["metadata"]["url"]: page["markdown"] for page in status["data"]}
            return ToolOutput(name=self.name or "firecrawl", output=results)
        return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl request errored: {status.get('error', 'unknown error')}")
    except Exception as e:
        return ToolOutput(name=self.name or "firecrawl", error=f"Firecrawl request errored: {type(e).__name__} - {e}")

async_forward `async`

async_forward(url: str) -> ToolOutput

Asynchronous version of the forward method.

Parameters:

Name	Type	Description	Default
`url`	`str`	The URL to scrape.	required

Returns:

Name	Type	Description
`ToolOutput`	`ToolOutput`	An object containing either the scraped content or an error message.

Source code in rllm/tools/web_tools/firecrawl_tool.py

async def async_forward(self, url: str) -> ToolOutput:
    """
    Asynchronous version of the forward method.

    The blocking ``forward`` call is run in a worker thread so the event
    loop is not stalled while the scrape is in progress.

    Args:
        url (str): The URL to scrape.

    Returns:
        ToolOutput: An object containing either the scraped content or an error message.
    """
    # Imported locally so this block does not rely on a module-level import.
    import asyncio

    return await asyncio.to_thread(self.forward, url=url)

GoogleSearchTool

Bases: Tool

A tool for searching google.

Source code in rllm/tools/web_tools/gsearch_tool.py

class GoogleSearchTool(Tool):
    """A tool for searching google.

    Each search issues one GET request to ``GOOGLE_SEARCH_ENDPOINT`` and
    returns a mapping of result link to snippet.
    """

    NAME = "google_search"
    DESCRIPTION = f"Search a query using the Google search engine, returning the top {REFERENCE_COUNT} results along with a short snippet about their contents"

    def __init__(self, name: str = NAME, description: str = DESCRIPTION, timeout: float = DEFAULT_SEARCH_ENGINE_TIMEOUT, reference_count: int = REFERENCE_COUNT):
        """
        Initialize the GoogleSearch tool.

        Args:
            name (str): The name of the tool, defaults to GoogleSearchTool.NAME.
            description (str): A description of the tool's purpose, defaults to GoogleSearchTool.DESCRIPTION.
            timeout (float): Maximum time in seconds to wait for search results, defaults to DEFAULT_SEARCH_ENGINE_TIMEOUT.
            reference_count (int): Number of results to return, defaults to REFERENCE_COUNT.
        """
        self.timeout = timeout
        self.reference_count = reference_count
        self._init_client()
        super().__init__(name=name, description=description)

    def _init_client(self):
        """Create the synchronous ``httpx.Client`` used for search requests."""
        self.client = httpx.Client()

    @property
    def json(self):
        """Return the tool's function-calling schema (a single required ``query`` string)."""
        return {"type": "function", "function": {"name": self.name, "description": self.description, "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Query to be submitted to Google search engine."}}, "required": ["query"]}}}

    def _search_with_google(self, query: str):
        """
        Search with google and return the contexts.

        Args:
            query (str): Query to be submitted to Google search engine.

        Returns:
            list: Up to ``self.reference_count`` entries from the response's
                ``items`` field, or an empty list when the response has no
                ``items`` key.

        Raises:
            ValueError: If GOOGLE_SEARCH_SECRET_KEY or GOOGLE_SEARCH_ENGINE_ID
                is unset or empty.
        """

        secret_key = os.getenv("GOOGLE_SEARCH_SECRET_KEY")
        engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        if not secret_key or not engine_id:
            raise ValueError("GOOGLE_SEARCH_SECRET_KEY or GOOGLE_SEARCH_ENGINE_ID is not set")
        # Honor the per-instance settings from __init__ instead of the
        # module-level defaults, so constructor arguments take effect.
        params: dict[str, Any] = {
            "key": secret_key,
            "cx": engine_id,
            "q": query,
            "num": self.reference_count,
        }

        response = self.client.get(url=GOOGLE_SEARCH_ENDPOINT, params=params, timeout=self.timeout)
        if not response.is_success:
            print(f"{response.status_code} {response.text}")
        json_content = response.json()
        try:
            contexts = json_content["items"][: self.reference_count]
        except KeyError:
            # No "items" key (e.g. an error payload): report and return nothing.
            print(f"Error encountered: {json_content}")
            return []
        return contexts

    def forward(self, query: str) -> ToolOutput:
        """
        Execute a Google search with the given query.

        Args:
            query (str): Query to be submitted to Google search engine.

        Returns:
            ToolOutput: An object containing either the search results
                (a dict mapping each result link to its snippet) or an error
                message.
        """
        try:
            assert self.client is not None, "Google Search Client not initialized"
            contexts = self._search_with_google(query)
            # A result without a snippet must not fail the whole search.
            results = {c["link"]: c.get("snippet", "") for c in contexts}
            return ToolOutput(name=self.name or "google_search", output=results)
        except Exception as e:
            return ToolOutput(name=self.name or "google_search", error=f"{type(e).__name__} - {str(e)}")

    def __del__(self):
        """Close the HTTP client on garbage collection (best effort)."""
        try:
            self.client.close()
        except Exception:
            # Finalizers must not raise; the client may be missing or closed.
            pass

__init__

__init__(name: str = NAME, description: str = DESCRIPTION, timeout: float = DEFAULT_SEARCH_ENGINE_TIMEOUT, reference_count: int = REFERENCE_COUNT)

Initialize the GoogleSearch tool.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the tool, defaults to GoogleSearchTool.NAME.	`NAME`
`description`	`str`	A description of the tool's purpose, defaults to GoogleSearchTool.DESCRIPTION.	`DESCRIPTION`
`timeout`	`float`	Maximum time in seconds to wait for search results, defaults to DEFAULT_SEARCH_ENGINE_TIMEOUT.	`DEFAULT_SEARCH_ENGINE_TIMEOUT`
`reference_count`	`int`	Number of results to return, defaults to REFERENCE_COUNT.	`REFERENCE_COUNT`

Source code in rllm/tools/web_tools/gsearch_tool.py

def __init__(self, name: str = NAME, description: str = DESCRIPTION, timeout: float = DEFAULT_SEARCH_ENGINE_TIMEOUT, reference_count: int = REFERENCE_COUNT):
    """
    Configure the Google search tool and create its HTTP client.

    Args:
        name (str): Tool name; defaults to the class-level NAME.
        description (str): Tool description; defaults to the class-level DESCRIPTION.
        timeout (float): Maximum time in seconds to wait for search results; defaults to DEFAULT_SEARCH_ENGINE_TIMEOUT.
        reference_count (int): Number of results to return; defaults to REFERENCE_COUNT.
    """
    # Record the per-instance settings, then create the client before the
    # base-class initializer runs.
    self.timeout, self.reference_count = timeout, reference_count
    self._init_client()
    super().__init__(description=description, name=name)

forward

forward(query: str) -> ToolOutput

Execute a Google search with the given query.

Parameters:

Name	Type	Description	Default
`query`	`str`	Query to be submitted to Google search engine.	required

Returns:

Name	Type	Description
`ToolOutput`	`ToolOutput`	An object containing either the search results or an error message.

Source code in rllm/tools/web_tools/gsearch_tool.py

def forward(self, query: str) -> ToolOutput:
    """
    Execute a Google search with the given query.

    Args:
        query (str): Query to be submitted to Google search engine.

    Returns:
        ToolOutput: An object containing either the search results (a dict
            mapping each result link to its snippet) or an error message.
    """
    try:
        assert self.client is not None, "Google Search Client not initialized"
        contexts = self._search_with_google(query)
        # A result without a snippet must not fail the whole search.
        results = {c["link"]: c.get("snippet", "") for c in contexts}
        return ToolOutput(name=self.name or "google_search", output=results)
    except Exception as e:
        return ToolOutput(name=self.name or "google_search", error=f"{type(e).__name__} - {str(e)}")

TavilyExtractTool

Bases: Tool

A tool for extracting data from websites.

Source code in rllm/tools/web_tools/tavily_tool.py

class TavilyExtractTool(Tool):
    """A tool for extracting data from websites."""

    def __init__(self):
        """Create the HTTP client and register the tool as ``tavily-extract``."""
        self._init_client()
        super().__init__(name="tavily-extract", description="Extract web page content from one or more specified URLs")

    @property
    def json(self):
        """Return the tool's function-calling schema (a required ``urls`` string array)."""
        return {"type": "function", "function": {"name": self.name, "description": self.description, "parameters": {"type": "object", "properties": {"urls": {"type": "array", "items": {"type": "string"}, "description": "Array of URLs to extract content from"}}, "required": ["urls"]}}}

    def _init_client(self):
        """Create the synchronous ``httpx.Client`` used for requests."""
        self.client: httpx.Client | None = httpx.Client()

    def _close_client(self):
        """Close the HTTP client if one exists and mark it as released."""
        # getattr: the attribute is absent when __init__ failed before
        # _init_client() could assign it.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()
        self.client = None

    def forward(self, urls: list[str]) -> ToolOutput:
        """
        Extract content from provided URLs using Tavily API.

        Args:
            urls (List[str]): List of URLs to extract content from.

        Returns:
            ToolOutput: An object containing either the extracted content
                (the decoded JSON response) or an error message.

        Raises:
            ValueError: If the TAVILY_API_KEY environment variable is unset or empty.
            RuntimeError: If the HTTP client has been closed.
        """
        api_key = os.getenv("TAVILY_API_KEY")
        if not api_key:
            raise ValueError("TAVILY_API_KEY is not set")

        if self.client is None:
            raise RuntimeError("HTTP client is not initialized")

        try:
            params = {"urls": urls, "include_images": False, "extract_depth": "basic"}
            headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

            response = self.client.post(url=TAVILY_EXTRACT_ENDPOINT, json=params, headers=headers)

            if not response.is_success:
                return ToolOutput(name=self.name or "tavily-extract", error=f"Error: {response.status_code} - {response.text}")

            output = response.json()
            return ToolOutput(name=self.name or "tavily-extract", output=output)
        except Exception as e:
            return ToolOutput(name=self.name or "tavily-extract", error=f"{type(e).__name__} - {str(e)}")

    def __del__(self):
        """Clean up resources when the tool is garbage collected."""
        try:
            self._close_client()
        except Exception:
            # Finalizers must not raise; cleanup here is best effort.
            pass

forward

forward(urls: list[str]) -> ToolOutput

Extract content from provided URLs using Tavily API.

Parameters:

Name	Type	Description	Default
`urls`	`List[str]`	List of URLs to extract content from.	required

Returns:

Name	Type	Description
`ToolOutput`	`ToolOutput`	An object containing either the extracted content or an error message.

Source code in rllm/tools/web_tools/tavily_tool.py

def forward(self, urls: list[str]) -> ToolOutput:
    """
    Fetch the content of the given URLs through the Tavily extract endpoint.

    Args:
        urls (List[str]): URLs whose content should be extracted.

    Returns:
        ToolOutput: Carries the decoded JSON response on success, or an error
            string when the request raises or the endpoint answers with a
            non-success status.

    Raises:
        ValueError: If the TAVILY_API_KEY environment variable is unset or empty.
        RuntimeError: If ``self.client`` is ``None``.
    """
    token = os.getenv("TAVILY_API_KEY")
    if not token:
        raise ValueError("TAVILY_API_KEY is not set")

    http = self.client
    if http is None:
        raise RuntimeError("HTTP client is not initialized")

    try:
        payload = {"urls": urls, "include_images": False, "extract_depth": "basic"}
        auth_headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
        reply = http.post(url=TAVILY_EXTRACT_ENDPOINT, json=payload, headers=auth_headers)

        if reply.is_success:
            # Decode inside the try so malformed JSON is reported, not raised.
            extracted = reply.json()
            return ToolOutput(name=self.name or "tavily-extract", output=extracted)

        return ToolOutput(name=self.name or "tavily-extract", error=f"Error: {reply.status_code} - {reply.text}")
    except Exception as exc:
        return ToolOutput(name=self.name or "tavily-extract", error=f"{type(exc).__name__} - {exc!s}")

__del__

__del__()

Clean up resources when the tool is garbage collected.

Source code in rllm/tools/web_tools/tavily_tool.py

def __del__(self):
    """Clean up resources when the tool is garbage collected.

    Cleanup is best effort: the instance may be only partially initialised
    when the finalizer runs, and a finalizer must never raise.
    """
    try:
        self._close_client()
    except Exception:
        pass

TavilySearchTool

Bases: Tool

A tool for searching the web using Tavily API.

Source code in rllm/tools/web_tools/tavily_tool.py

class TavilySearchTool(Tool):
    """A tool for searching the web using Tavily API."""

    def __init__(self):
        """Create the HTTP client and register the tool as ``tavily-search``."""
        self._init_client()
        super().__init__(name="tavily-search", description="Search the web for information on a specific query")

    @property
    def json(self):
        """Return the tool's function-calling schema; only ``query`` is required."""
        return {
            "type": "function",
            "function": {
                "name": self.name,
                "description": self.description,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string", "description": "The search query"},
                        "search_depth": {"type": "string", "enum": ["basic", "advanced"], "description": "The depth of search (basic or advanced)"},
                        "include_domains": {"type": "array", "items": {"type": "string"}, "description": "List of domains to include in the search"},
                        "exclude_domains": {"type": "array", "items": {"type": "string"}, "description": "List of domains to exclude from the search"},
                        "max_results": {"type": "integer", "description": "Maximum number of search results to return"},
                    },
                    "required": ["query"],
                },
            },
        }

    def _init_client(self):
        """Create the synchronous ``httpx.Client`` used for requests."""
        self.client: httpx.Client | None = httpx.Client()

    def _close_client(self):
        """Close the HTTP client if one exists and mark it as released."""
        # getattr: the attribute is absent when __init__ failed before
        # _init_client() could assign it.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()
        self.client = None

    def forward(self, query: str, search_depth: str = "basic", include_domains: list[str] | None = None, exclude_domains: list[str] | None = None, max_results: int = 5) -> ToolOutput:
        """
        Search the web using Tavily API.

        Args:
            query (str): The search query.
            search_depth (str, optional): The depth of search. Defaults to "basic".
            include_domains (List[str], optional): List of domains to include in the search.
            exclude_domains (List[str], optional): List of domains to exclude from the search.
            max_results (int, optional): Maximum number of search results to return. Defaults to 5.

        Returns:
            ToolOutput: An object containing either the search results
                (the decoded JSON response) or an error message.

        Raises:
            ValueError: If the TAVILY_API_KEY environment variable is unset or empty.
            RuntimeError: If the HTTP client has been closed.
        """
        api_key = os.getenv("TAVILY_API_KEY")
        if not api_key:
            raise ValueError("TAVILY_API_KEY is not set")

        if self.client is None:
            raise RuntimeError("HTTP client is not initialized")

        try:
            params = {"query": query, "search_depth": search_depth, "max_results": max_results}

            # Domain filters are sent only when non-empty.
            if include_domains:
                params["include_domains"] = include_domains
            if exclude_domains:
                params["exclude_domains"] = exclude_domains

            headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

            response = self.client.post(url=TAVILY_SEARCH_ENDPOINT, json=params, headers=headers)

            if not response.is_success:
                return ToolOutput(name=self.name or "tavily-search", error=f"Error: {response.status_code} - {response.text}")

            result = response.json()
            return ToolOutput(name=self.name or "tavily-search", output=result)
        except Exception as e:
            return ToolOutput(name=self.name or "tavily-search", error=f"{type(e).__name__} - {str(e)}")

    def __del__(self):
        """Clean up resources when the tool is garbage collected."""
        try:
            self._close_client()
        except Exception:
            # Finalizers must not raise; cleanup here is best effort.
            pass

forward

forward(query: str, search_depth: str = 'basic', include_domains: list[str] | None = None, exclude_domains: list[str] | None = None, max_results: int = 5) -> ToolOutput

Search the web using Tavily API.

Parameters:

Name	Type	Description	Default
`query`	`str`	The search query.	required
`search_depth`	`str`	The depth of search. Defaults to "basic".	`'basic'`
`include_domains`	`List[str]`	List of domains to include in the search.	`None`
`exclude_domains`	`List[str]`	List of domains to exclude from the search.	`None`
`max_results`	`int`	Maximum number of search results to return. Defaults to 5.	`5`

Returns:

Name	Type	Description
`ToolOutput`	`ToolOutput`	An object containing either the search results or an error message.

Source code in rllm/tools/web_tools/tavily_tool.py

def forward(self, query: str, search_depth: str = "basic", include_domains: list[str] | None = None, exclude_domains: list[str] | None = None, max_results: int = 5) -> ToolOutput:
    """
    Run a web search through the Tavily search endpoint.

    Args:
        query (str): The search query.
        search_depth (str, optional): The depth of search. Defaults to "basic".
        include_domains (List[str], optional): Domains to restrict the search to; sent only when non-empty.
        exclude_domains (List[str], optional): Domains to leave out of the search; sent only when non-empty.
        max_results (int, optional): Maximum number of search results to return. Defaults to 5.

    Returns:
        ToolOutput: Carries the decoded JSON response on success, or an error
            string when the request raises or the endpoint answers with a
            non-success status.

    Raises:
        ValueError: If the TAVILY_API_KEY environment variable is unset or empty.
        RuntimeError: If ``self.client`` is ``None``.
    """
    token = os.getenv("TAVILY_API_KEY")
    if not token:
        raise ValueError("TAVILY_API_KEY is not set")

    http = self.client
    if http is None:
        raise RuntimeError("HTTP client is not initialized")

    try:
        payload = {"query": query, "search_depth": search_depth, "max_results": max_results}

        # Attach each domain filter only when the caller supplied a non-empty one.
        domain_filters = {"include_domains": include_domains, "exclude_domains": exclude_domains}
        payload.update({field: domains for field, domains in domain_filters.items() if domains})

        auth_headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
        reply = http.post(url=TAVILY_SEARCH_ENDPOINT, json=payload, headers=auth_headers)

        if reply.is_success:
            # Decode inside the try so malformed JSON is reported, not raised.
            found = reply.json()
            return ToolOutput(name=self.name or "tavily-search", output=found)

        return ToolOutput(name=self.name or "tavily-search", error=f"Error: {reply.status_code} - {reply.text}")
    except Exception as exc:
        return ToolOutput(name=self.name or "tavily-search", error=f"{type(exc).__name__} - {exc!s}")

__del__

__del__()

Clean up resources when the tool is garbage collected.

Source code in rllm/tools/web_tools/tavily_tool.py

def __del__(self):
    """Clean up resources when the tool is garbage collected.

    Cleanup is best effort: the instance may be only partially initialised
    when the finalizer runs, and a finalizer must never raise.
    """
    try:
        self._close_client()
    except Exception:
        pass

Web Tools

rllm.tools.web_tools

FirecrawlTool

json property

__init__

forward

async_forward async

GoogleSearchTool

__init__

forward

TavilyExtractTool

forward

__del__

TavilySearchTool

forward

__del__

json `property`

init

async_forward `async`

init

del

del