feat(tool): Add URLReaderTool (#2577)

This pull request adds the URLReaderTool to the list of tools in the GPT4Brain class. The URLReaderTool allows for reading the content of a URL.
2024-08-17 02:10:31 +03:00 · 2024-05-10 14:03:21 +02:00 · 2024-05-10 14:03:21 +02:00 · 105a2b8ecc
commit 105a2b8ecc
parent 62f9b5bed2
3 changed files with 56 additions and 2 deletions
--- a/backend/modules/brain/integrations/GPT4/Brain.py
+++ b/backend/modules/brain/integrations/GPT4/Brain.py
@@ -15,7 +15,7 @@ from modules.brain.knowledge_brain_qa import KnowledgeBrainQA
 from modules.chat.dto.chats import ChatQuestion
 from modules.chat.dto.outputs import GetChatHistoryOutput
 from modules.chat.service.chat_service import ChatService
-from modules.tools import ImageGeneratorTool, WebSearchTool
+from modules.tools import ImageGeneratorTool, URLReaderTool, WebSearchTool


 class AgentState(TypedDict):
@@ -37,7 +37,7 @@ class GPT4Brain(KnowledgeBrainQA):
        KnowledgeBrainQA (_type_): A brain that store the knowledge internaly
    """

-    tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool()]
+    tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool(), URLReaderTool()]
    tool_executor: ToolExecutor = ToolExecutor(tools)
    model_function: ChatOpenAI = None

--- a/backend/modules/tools/__init__.py
+++ b/backend/modules/tools/__init__.py
@@ -1,3 +1,4 @@
 from .image_generator import ImageGeneratorTool
 from .web_search import WebSearchTool
+from .url_reader import URLReaderTool

--- a/backend/modules/tools/url_reader.py
+++ b/backend/modules/tools/url_reader.py
@@ -0,0 +1,53 @@
+# Tool that loads a single URL with Playwright and returns its text content
+import os
+from typing import Dict, Optional, Type
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForToolRun,
+    CallbackManagerForToolRun,
+)
+from langchain.pydantic_v1 import BaseModel as BaseModelV1
+from langchain.pydantic_v1 import Field as FieldV1
+from langchain_community.document_loaders import PlaywrightURLLoader
+from langchain_core.tools import BaseTool
+from logger import get_logger
+from pydantic import BaseModel
+
+logger = get_logger(__name__)
+
+
+class URLReaderInput(BaseModelV1):  # argument schema for the url-reader tool
+    url: str = FieldV1(default=..., description="url to read", title="url")
+
+
+class URLReaderTool(BaseTool):
+    """Tool that loads a URL with Playwright and returns its text content.
+
+    The loader is asked to remove the ``header`` and ``footer`` selectors.
+    """
+
+    name = "url-reader"
+    description = "useful for when you need to read the content of a url."
+    args_schema: Type[BaseModel] = URLReaderInput
+    api_key = os.getenv("BRAVE_SEARCH_API_KEY")  # not read by this tool
+
+    @staticmethod
+    def _build_loader(url: str) -> PlaywrightURLLoader:
+        """Return the Playwright loader configured for a single ``url``."""
+        return PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
+
+    def _run(
+        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
+    ) -> Dict:
+        """Load ``url`` synchronously and return ``{"content": <page text>}``."""
+        pages = self._build_loader(url).load()
+        return {"content": "".join(page.page_content for page in pages)}
+
+    async def _arun(
+        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
+    ) -> Dict:
+        """Load ``url`` asynchronously and return ``{"content": <page text>}``."""
+        # Await aload() rather than calling load(): load() is synchronous
+        # and would block the event loop for the whole page fetch.
+        pages = await self._build_loader(url).aload()
+        return {"content": "".join(page.page_content for page in pages)}