File size: 5,254 Bytes
88f3fce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import json
from typing import TYPE_CHECKING, Optional
from pydantic import Field, model_validator
from app.agent.toolcall import ToolCallAgent
from app.logger import logger
from app.prompt.browser import NEXT_STEP_PROMPT, SYSTEM_PROMPT
from app.schema import Message, ToolChoice
from app.tool import BrowserUseTool, Terminate, ToolCollection
from app.tool.sandbox.sb_browser_tool import SandboxBrowserTool
# Avoid circular import if BrowserAgent needs BrowserContextHelper
if TYPE_CHECKING:
from app.agent.base import BaseAgent # Or wherever memory is defined
class BrowserContextHelper:
def __init__(self, agent: "BaseAgent"):
self.agent = agent
self._current_base64_image: Optional[str] = None
async def get_browser_state(self) -> Optional[dict]:
browser_tool = self.agent.available_tools.get_tool(BrowserUseTool().name)
if not browser_tool:
browser_tool = self.agent.available_tools.get_tool(
SandboxBrowserTool().name
)
if not browser_tool or not hasattr(browser_tool, "get_current_state"):
logger.warning("BrowserUseTool not found or doesn't have get_current_state")
return None
try:
result = await browser_tool.get_current_state()
if result.error:
logger.debug(f"Browser state error: {result.error}")
return None
if hasattr(result, "base64_image") and result.base64_image:
self._current_base64_image = result.base64_image
else:
self._current_base64_image = None
return json.loads(result.output)
except Exception as e:
logger.debug(f"Failed to get browser state: {str(e)}")
return None
async def format_next_step_prompt(self) -> str:
"""Gets browser state and formats the browser prompt."""
browser_state = await self.get_browser_state()
url_info, tabs_info, content_above_info, content_below_info = "", "", "", ""
results_info = "" # Or get from agent if needed elsewhere
if browser_state and not browser_state.get("error"):
url_info = f"\n URL: {browser_state.get('url', 'N/A')}\n Title: {browser_state.get('title', 'N/A')}"
tabs = browser_state.get("tabs", [])
if tabs:
tabs_info = f"\n {len(tabs)} tab(s) available"
pixels_above = browser_state.get("pixels_above", 0)
pixels_below = browser_state.get("pixels_below", 0)
if pixels_above > 0:
content_above_info = f" ({pixels_above} pixels)"
if pixels_below > 0:
content_below_info = f" ({pixels_below} pixels)"
if self._current_base64_image:
image_message = Message.user_message(
content="Current browser screenshot:",
base64_image=self._current_base64_image,
)
self.agent.memory.add_message(image_message)
self._current_base64_image = None # Consume the image after adding
return NEXT_STEP_PROMPT.format(
url_placeholder=url_info,
tabs_placeholder=tabs_info,
content_above_placeholder=content_above_info,
content_below_placeholder=content_below_info,
results_placeholder=results_info,
)
async def cleanup_browser(self):
browser_tool = self.agent.available_tools.get_tool(BrowserUseTool().name)
if browser_tool and hasattr(browser_tool, "cleanup"):
await browser_tool.cleanup()
class BrowserAgent(ToolCallAgent):
"""
A browser agent that uses the browser_use library to control a browser.
This agent can navigate web pages, interact with elements, fill forms,
extract content, and perform other browser-based actions to accomplish tasks.
"""
name: str = "browser"
description: str = "A browser agent that can control a browser to accomplish tasks"
system_prompt: str = SYSTEM_PROMPT
next_step_prompt: str = NEXT_STEP_PROMPT
max_observe: int = 10000
max_steps: int = 20
# Configure the available tools
available_tools: ToolCollection = Field(
default_factory=lambda: ToolCollection(BrowserUseTool(), Terminate())
)
# Use Auto for tool choice to allow both tool usage and free-form responses
tool_choices: ToolChoice = ToolChoice.AUTO
special_tool_names: list[str] = Field(default_factory=lambda: [Terminate().name])
browser_context_helper: Optional[BrowserContextHelper] = None
@model_validator(mode="after")
def initialize_helper(self) -> "BrowserAgent":
self.browser_context_helper = BrowserContextHelper(self)
return self
async def think(self) -> bool:
"""Process current state and decide next actions using tools, with browser state info added"""
self.next_step_prompt = (
await self.browser_context_helper.format_next_step_prompt()
)
return await super().think()
async def cleanup(self):
"""Clean up browser agent resources by calling parent cleanup."""
await self.browser_context_helper.cleanup_browser()
|