Build evaluation datasets using existing environments or custom setups
Create benchmarks to evaluate agent capabilities systematically. HUD makes it easy to build evaluation datasets using existing environments or by creating entirely custom setups.
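Each entry in a list like `web_tasks` is a plain dict. A minimal sketch of one, assuming the same schema as the SheetBench task shown at the end of this page (`prompt`, `mcp_config`, `evaluate_tool`); the `url_match` evaluator name here is hypothetical:

```python
# Hypothetical minimal task dict; field names follow the SheetBench example below
web_task = {
    "prompt": "Navigate to the pricing page and open the FAQ section",
    "mcp_config": {
        "hud": {
            "url": "https://mcp.hud.so/v3/mcp",
            "headers": {
                "Authorization": "Bearer ${HUD_API_KEY}",
                "Mcp-Image": "hudpython/hud-remote-browser:v1.1"
            }
        }
    },
    "evaluate_tool": {
        "name": "evaluate",
        "arguments": {
            "name": "url_match",  # hypothetical evaluator name
            "arguments": {"args": {"pattern": "/pricing"}}
        }
    }
}

web_tasks = [web_task]  # run_dataset below takes a list of these
```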
```python
from hud.datasets import run_dataset
from hud.agents import ClaudeAgent

# Test your tasks
results = await run_dataset(
    "Web Navigation Test",
    web_tasks,  # Your list of task dicts
    agent_class=ClaudeAgent,
    max_concurrent=2
)

# Check results
success_rate = sum(r.reward > 0.5 for r in results) / len(results)
print(f"Success rate: {success_rate:.2%}")
```
```python
from hud.datasets import save_tasks

# Save to HuggingFace (requires HF token)
save_tasks(
    web_tasks,  # List of task dictionaries
    repo_id="my-org/web-navigation-benchmark",
    private=False,  # Make it public
    tags=["web", "navigation", "automation"]
)
```
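Anyone can then pull the benchmark back down with the standard HuggingFace `datasets` library. A sketch, assuming `save_tasks` writes a regular dataset with a `train` split (check the repo after saving):

```python
from datasets import load_dataset

# Load the published benchmark back as a list of task dicts
# (assumes the repo contains a standard "train" split)
dataset = load_dataset("my-org/web-navigation-benchmark", split="train")
web_tasks = [dict(row) for row in dataset]
```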
Leaderboards are automatically created when the first scorecard is published for your dataset. Simply run an evaluation and create a scorecard at app.hud.so/leaderboards/{your-dataset-id}.
For deeper customization, explore existing evaluators and contribute new ones:
```python
# Example: Contributing a new evaluator
# In environments/remote_browser/src/hud_controller/evaluate/form_complete.py
from hud.tools.types import EvaluationResult
from . import evaluate

@evaluate.tool("form_complete")
async def form_complete(ctx, required_fields: list[str]):
    """Check if all required form fields are filled."""
    # Access environment from the hub
    env = evaluate.env  # BrowserEnvironmentContext or similar

    # Use environment methods
    page_content = await env.get_page_content()

    # Check each field (simplified example)
    filled_count = 0
    for field in required_fields:
        if f'value="{field}"' in page_content or f'>{field}<' in page_content:
            filled_count += 1

    success = filled_count == len(required_fields)
    return EvaluationResult(
        reward=1.0 if success else filled_count / len(required_fields),
        done=success,
        info={"filled": filled_count, "required": len(required_fields)}
    )
```
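Once an evaluator like this ships in the environment image, a task references it by its registered name. A sketch of such a task fragment, mirroring the `evaluate_tool` schema of the SheetBench example below (the prompt text and field names are illustrative):

```python
# Hypothetical task fragment referencing the new evaluator by its registered name
task_with_form_check = {
    "prompt": "Fill out the contact form with your email, name, and message",
    "evaluate_tool": {
        "name": "evaluate",
        "arguments": {
            "name": "form_complete",
            "arguments": {"required_fields": ["email", "name", "message"]}
        }
    }
}
```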
Submit evaluators via pull requests to expand environment capabilities.
```python
# Example task from the dataset
sheetbench_task = {
    "id": "6e4744c7-b2c9-4bb6-807e-2cc144a4e8c2",
    "prompt": "Calculate from the RawData tab the z-scores from the mean close price for each row. Return, starting in ANSWER!A1 and descending to ANSWER!A5, the 5 dates with the greatest absolute value of standard deviations from the mean",
    "mcp_config": {
        "hud": {
            "url": "https://mcp.hud.so/v3/mcp",
            "headers": {
                "Authorization": "Bearer ${HUD_API_KEY}",
                "Run-Id": "${RUN_ID}",
                "Mcp-Image": "hudpython/hud-remote-browser:v1.1"
            }
        }
    },
    "setup_tool": {
        "name": "setup",
        "arguments": {
            "name": "sheets_from_xlsx",
            "arguments": {
                "file_url": "https://gahludmjcsmszgyufydt.supabase.co//storage/v1/object/public/sheetbench/c6ddeb9a-0c16-4f5e-8a06-f148ebb4be8a/setup_input_2.xlsx?"
            }
        }
    },
    "evaluate_tool": {
        "name": "evaluate",
        "arguments": {
            "name": "sheets_cell_values",
            "arguments": {
                "args": {
                    "A1": "1/12/2024",
                    "A2": "1/10/2024",
                    "A3": "1/15/2024",
                    "A4": "1/11/2024",
                    "A5": "1/17/2024"
                }
            }
        }
    },
    "metadata": {
        "partial": True,
        "gold_file_url": "https://gahludmjcsmszgyufydt.supabase.co//storage/v1/object/public/sheetbench/c6ddeb9a-0c16-4f5e-8a06-f148ebb4be8a/gold_solution_2.xlsx?"
    }
}

# The dataset includes 50 such tasks covering:
# - Formula creation (VLOOKUP, SUMIF, etc.)
# - Data analysis (z-scores, correlations)
# - Data manipulation (sorting, filtering)
# - Chart creation
# - Pivot tables
```
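Tasks in this format plug directly into the `run_dataset` helper from the start of this page. A minimal sketch, reusing the example task above (the run name, agent choice, and concurrency are illustrative):

```python
from hud.datasets import run_dataset
from hud.agents import ClaudeAgent

# Run the single example task; a full benchmark run would pass all 50
results = await run_dataset(
    "SheetBench Sample",
    [sheetbench_task],
    agent_class=ClaudeAgent,
    max_concurrent=1
)
print(f"Reward: {results[0].reward}")
```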