hud-evals · jayendra-ram · Aug 26, 2025
diff --git a/environments/deep_research/Dockerfile b/environments/deep_research/Dockerfile
@@ -0,0 +1,22 @@
+# Use the HUD base browser image with Playwright and uv pre-installed
+FROM hudpython/base-browser:latest
+
+WORKDIR /app
+
+# Only the package metadata and sources are needed for the editable install below
+COPY pyproject.toml ./
+COPY src/ ./src/
+
+# Install the package using the existing venv at /opt/venv
+RUN uv pip install --python /opt/venv -e .
+
+# Display size, presumably in pixels and consumed by the base image or the
+# browser tooling -- TODO confirm where these are read
+ENV DISPLAY_WIDTH=1448
+ENV DISPLAY_HEIGHT=944
+
+# Unbuffered stdout/stderr so logs show up immediately; do not write .pyc files
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+
+# Initial URL defaults to Wikipedia if not provided
+ENV INITIAL_URL="https://en.wikipedia.org/wiki/Main_Page"
+
+# Run the MCP server module (src/hud_controller/server.py)
+CMD ["python", "-m", "hud_controller.server"]
+
diff --git a/environments/deep_research/README.md b/environments/deep_research/README.md
@@ -0,0 +1,45 @@
+# HUD Deep Research MCP Server
+
+Local Playwright-based environment tailored for reading and analyzing Wikipedia pages.
+
+## Build
+
+```bash
+docker build -t hud-deep-research:dev .
+```
+
+## Run (production)
+
+```bash
+docker run --rm -i \
+  -e INITIAL_URL="https://en.wikipedia.org/wiki/Main_Page" \
+  hud-deep-research:dev
+```
+
+## Develop (hot-reload)
+
+```bash
+# From repo root
+python -m hud.cli dev environments/deep_research --build
+```
+
+## Debug (stdio inspector)
+
+```bash
+# Basic MCP initialize + list tools
+python -m hud.cli debug environments/deep_research --max-phase 2 --build
+```
+
+## Tools
+
+- setup.navigate(url)
+- setup.open_wikipedia_page(title, lang="en")
+- evaluate.url_match(expected_substring)
+- evaluate.page_contains(search_terms, partial_rewarding=True)
+- playwright (navigate, screenshot, click, type, wait_for_element, get_page_info)
+
+## Environment variables
+
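+- INITIAL_URL: Default page to open. Defaults to the Wikipedia Main Page.
+- BROWSER_URL: Alternative name for the initial URL; takes precedence over INITIAL_URL when both are set.
+- SKIP_BROWSER: Set to `1` or `true` to skip launching the browser and the initial navigation.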
+
diff --git a/environments/deep_research/pyproject.toml b/environments/deep_research/pyproject.toml
@@ -0,0 +1,23 @@
+[project]
+name = "hud-deep-research"
+version = "0.1.0"
+description = "HUD Deep Research environment focused on reading Wikipedia via Playwright"
+requires-python = ">=3.11,<3.13"
+dependencies = [
+    "hud-python",
+    "playwright",
+]
+
+[project.scripts]
+hud-deep-research = "hud_controller.server:mcp.run"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/hud_controller"]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
diff --git a/environments/deep_research/src/hud_controller/__init__.py b/environments/deep_research/src/hud_controller/__init__.py
@@ -0,0 +1,4 @@
+"""HUD Deep Research Controller - Local Playwright environment for Wikipedia reading."""
+
+__version__ = "0.1.0"
+
diff --git a/environments/deep_research/src/hud_controller/evaluate/__init__.py b/environments/deep_research/src/hud_controller/evaluate/__init__.py
@@ -0,0 +1,12 @@
+"""Evaluation layer for deep research environment."""
+
+from __future__ import annotations
+
+from hud.tools.base import BaseHub
+
+# Shared hub instance; the evaluator modules imported below register their tools on it.
+evaluate = BaseHub("evaluate")
+
+# Imported after `evaluate` is defined because each submodule does
+# `from . import evaluate` and decorates its tool with `@evaluate.tool(...)`.
+from . import url_match, page_contains  # noqa: E402
+
+__all__ = ["evaluate"]
+
diff --git a/environments/deep_research/src/hud_controller/evaluate/page_contains.py b/environments/deep_research/src/hud_controller/evaluate/page_contains.py
@@ -0,0 +1,46 @@
+"""Page contains evaluator for deep research environment."""
+
+import logging
+from typing import List, Union
+
+from fastmcp import Context
+from hud.tools.types import EvaluationResult
+
+from . import evaluate
+
+logger = logging.getLogger(__name__)
+
+
+@evaluate.tool("page_contains")
+async def page_contains(ctx: Context, search_terms: Union[str, List[str]], partial_rewarding: bool = True):
+    """Check if the page contains specific text."""
+    tool = evaluate.env
+    if not tool or not tool.page:
+        return EvaluationResult(reward=0.0, done=False, content="No page", info={"success": False})
+    try:
+        content = await tool.page.content()
+        terms = [search_terms] if isinstance(search_terms, str) else list(search_terms)
+        found = [t for t in terms if t in content]
+        not_found = [t for t in terms if t not in content]
+        if partial_rewarding and terms:
+            reward = len(found) / len(terms)
+        else:
+            reward = 1.0 if not not_found else 0.0
+        msg = (
+            "All terms found" if reward == 1.0 else (f"Found {len(found)} of {len(terms)} terms" if reward > 0 else "No terms found")
+        )
+        return EvaluationResult(
+            reward=float(reward),
+            done=reward == 1.0,
+            content=msg,
+            info={
+                "success": reward > 0,
+                "found_terms": found,
+                "not_found_terms": not_found,
+                "total_terms": len(terms),
+            },
+        )
+    except Exception as e:
+        logger.error(f"page_contains failed: {e}")
+        return EvaluationResult(reward=0.0, done=False, content=str(e), info={"success": False})
+
diff --git a/environments/deep_research/src/hud_controller/evaluate/url_match.py b/environments/deep_research/src/hud_controller/evaluate/url_match.py
@@ -0,0 +1,30 @@
+"""URL match evaluator for deep research environment."""
+
+import logging
+from fastmcp import Context
+from hud.tools.types import EvaluationResult
+
+from . import evaluate
+
+logger = logging.getLogger(__name__)
+
+
+@evaluate.tool("url_match")
+async def url_match(ctx: Context, expected_substring: str):
+    """Reward if current URL contains the expected substring."""
+    tool = evaluate.env
+    if not tool or not tool.page:
+        return EvaluationResult(reward=0.0, done=False, content="No page", info={"success": False})
+    try:
+        url = tool.page.url
+        ok = expected_substring in url
+        return EvaluationResult(
+            reward=1.0 if ok else 0.0,
+            done=ok,
+            content=f"URL is {url}",
+            info={"success": ok, "url": url},
+        )
+    except Exception as e:
+        logger.error(f"url_match failed: {e}")
+        return EvaluationResult(reward=0.0, done=False, content=str(e), info={"success": False})
+
diff --git a/environments/deep_research/src/hud_controller/server.py b/environments/deep_research/src/hud_controller/server.py
@@ -0,0 +1,142 @@
+"""MCP server for deep research (Wikipedia-focused) environment."""
+
+import logging
+import os
+import sys
+from datetime import datetime
+from typing import Optional, TypedDict
+
+# Logging is configured before the hud/project imports that follow.
+# Output goes to stderr -- presumably to keep stdout free for the MCP stdio
+# transport; TODO confirm. force=True replaces any handlers already attached
+# to the root logger.
+logging.basicConfig(
+    stream=sys.stderr,
+    level=logging.INFO,
+    format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s",
+    force=True,
+)
+logger = logging.getLogger(__name__)
+
+from hud.server import MCPServer
+
+from .tools.playwright import PlaywrightToolWithMemory
+from .tools.executor import BrowserExecutor
+
+from .setup import setup as setup_hub
+from .evaluate import evaluate as evaluate_hub
+
+
+class Telemetry(TypedDict):
+    """Shape of the payload returned by the ``telemetry://live`` resource."""
+
+    provider: str  # get_telemetry_resource always reports "local-playwright"
+    status: str  # "running" when the tool has a page, otherwise "not_initialized"
+    live_url: str | None  # always None in get_telemetry_resource
+    timestamp: str  # produced by datetime.now().isoformat() (naive local time)
+    cdp_url: str | None  # always None in get_telemetry_resource
+    instance_id: str | None  # always None in get_telemetry_resource
+
+
+# Server instance; the resource, initialize and shutdown handlers below attach to it.
+mcp = MCPServer(
+    name="HUD Deep Research",
+    instructions="""
+    This environment launches a local Playwright browser tailored for reading and
+    analyzing Wikipedia pages. It exposes setup and evaluation tools plus a Playwright
+    automation tool with navigation/action history.
+    """,
+)
+
+# Module-level state: assigned in initialize_environment and reset to None
+# in shutdown_environment.
+playwright_tool: Optional[PlaywrightToolWithMemory] = None
+browser_executor: Optional[BrowserExecutor] = None
+
+
+@mcp.resource("telemetry://live")
+async def get_telemetry_resource() -> Telemetry:
+    status = "running" if playwright_tool and playwright_tool.page else "not_initialized"
+    return Telemetry(
+        provider="local-playwright",
+        status=status,
+        live_url=None,
+        timestamp=datetime.now().isoformat(),
+        cdp_url=None,
+        instance_id=None,
+    )
+
+
+@mcp.initialize
+async def initialize_environment(ctx):
+    """Set up the browser tool, executor and hubs, then open the initial URL.
+
+    Steps, in order: create the Playwright tool (launching the browser unless
+    SKIP_BROWSER is "1", "true" or "True"), register it on the server, create
+    the browser executor, attach the tool to the setup/evaluate hubs and mount
+    them, and finally navigate to the initial URL. Progress is logged at each
+    step and also sent to the client when it supplied a progress token.
+
+    Args:
+        ctx: Initialization context; ``ctx.meta`` and ``ctx.session`` are used.
+
+    Raises:
+        Exception: Any failure is reported to the client (when a progress
+            token exists) and then re-raised unchanged.
+    """
+    global playwright_tool, browser_executor
+
+    # NOTE(review): assumes ctx.meta is a mapping; `.get` fails if it is None -- confirm
+    metadata = ctx.meta
+    progress_token = metadata.get("progressToken", None)
+
+    async def send_progress(progress: int, message: str):
+        # Notify the client only when it asked for progress; always log locally
+        if progress_token:
+            await ctx.session.send_progress_notification(
+                progress_token=progress_token,
+                progress=progress,
+                total=100,
+                message=message,
+            )
+        logger.info(f"[{progress}%] {message}")
+
+    try:
+        await send_progress(10, "Starting deep_research initialization...")
+
+        skip_browser = os.getenv("SKIP_BROWSER") in {"1", "true", "True"}
+
+        # Initialize local Playwright tool
+        playwright_tool = PlaywrightToolWithMemory(context=None, cdp_url=None)
+        if not skip_browser:
+            await playwright_tool._ensure_browser()
+            await send_progress(40, "Playwright browser launched")
+        else:
+            await send_progress(40, "Skipping browser launch (SKIP_BROWSER set)")
+
+        # Register playwright tool
+        mcp.add_tool(playwright_tool.mcp)
+        await send_progress(55, "Playwright tool registered")
+
+        # Initialize executor and computer tools (HUD Computer only, no cloud providers)
+        browser_executor = BrowserExecutor(playwright_tool)
+        await send_progress(65, "Browser executor initialized")
+
+        # Mount hubs with environment
+        # The hub tools read `setup.env` / `evaluate.env` to reach the browser tool
+        setup_hub.env = playwright_tool
+        evaluate_hub.env = playwright_tool
+        mcp.mount(setup_hub)
+        mcp.mount(evaluate_hub)
+        await send_progress(80, "Setup and evaluate hubs registered")
+
+        # Navigate to initial URL
+        if not skip_browser:
+            # BROWSER_URL wins over INITIAL_URL; Wikipedia's main page is the fallback
+            initial_url = os.getenv("BROWSER_URL") or os.getenv("INITIAL_URL") or (
+                "https://en.wikipedia.org/wiki/Main_Page"
+            )
+            await playwright_tool.navigate(initial_url)
+            await send_progress(100, f"Navigated to {initial_url}")
+        else:
+            await send_progress(100, "Initialization complete (browser launch skipped)")
+    except Exception as e:
+        # NOTE(review): a browser launched before the failure is not closed
+        # here -- confirm that shutdown still runs after a failed initialize
+        if progress_token:
+            await ctx.session.send_progress_notification(
+                progress_token=progress_token,
+                progress=0,
+                total=100,
+                message=f"Initialization failed: {str(e)}",
+            )
+        raise
+
+
+@mcp.shutdown
+async def shutdown_environment():
+    global playwright_tool, browser_executor
+    logger.info("Shutting down deep_research environment")
+    try:
+        if playwright_tool and playwright_tool._browser:
+            await playwright_tool._browser.close()
+    except Exception as e:
+        logger.error(f"Error closing browser: {e}")
+    finally:
+        playwright_tool = None
+        browser_executor = None
+
+
+if __name__ == "__main__":
+    mcp.run()
+
diff --git a/environments/deep_research/src/hud_controller/setup/__init__.py b/environments/deep_research/src/hud_controller/setup/__init__.py
@@ -0,0 +1,12 @@
+"""Setup layer for deep research environment."""
+
+from __future__ import annotations
+
+from hud.tools.base import BaseHub
+
+# Shared hub instance; the setup module imported below registers its tools on it.
+setup = BaseHub("setup")
+
+# Imported after `setup` is defined because the submodule does
+# `from . import setup` and decorates its tools with `@setup.tool(...)`.
+from . import navigate  # noqa: E402
+
+__all__ = ["setup"]
+
diff --git a/environments/deep_research/src/hud_controller/setup/navigate.py b/environments/deep_research/src/hud_controller/setup/navigate.py
@@ -0,0 +1,34 @@
+"""Setup tools for navigation (Wikipedia-focused)."""
+
+import logging
+from typing import Optional
+
+from fastmcp import Context
+
+from . import setup
+
+logger = logging.getLogger(__name__)
+
+
+@setup.tool("navigate")
+async def navigate(ctx: Context, url: str):
+    """Navigate to a URL."""
+    tool = setup.env
+    await tool.navigate(url)
+    return {"success": True, "message": f"Navigated to {url}"}
+
+
+@setup.tool("open_wikipedia_page")
+async def open_wikipedia_page(ctx: Context, title: str, lang: Optional[str] = "en"):
+    """Open a Wikipedia page by article title.
+
+    Args:
+        title: Article title (e.g., "Alan Turing")
+        lang: Language code (default: en)
+    """
+    slug = title.strip().replace(" ", "_")
+    url = f"https://{lang}.wikipedia.org/wiki/{slug}"
+    tool = setup.env
+    await tool.navigate(url)
+    return {"success": True, "message": f"Opened Wikipedia page: {title}", "url": url}
+
diff --git a/environments/deep_research/src/hud_controller/tools/__init__.py b/environments/deep_research/src/hud_controller/tools/__init__.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+from .playwright import PlaywrightToolWithMemory
+from .executor import BrowserExecutor
+
+__all__ = ["PlaywrightToolWithMemory", "BrowserExecutor"]
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		"""HUD Deep Research Controller - Local Playwright environment for Wikipedia reading."""

		__version__ = "0.1.0"