From 105a2b8eccd64c24dfe2278d365fd8bd1f34d268 Mon Sep 17 00:00:00 2001
From: Stan Girard <girard.stanislas@gmail.com>
Date: Fri, 10 May 2024 14:03:21 +0200
Subject: [PATCH] feat(tool): Add URLReaderTool (#2577)

This pull request adds the URLReaderTool to the list of tools in the
GPT4Brain class. The URLReaderTool allows for reading the content of a
URL.
---
 .../modules/brain/integrations/GPT4/Brain.py  |  4 +-
 backend/modules/tools/__init__.py             |  1 +
 backend/modules/tools/url_reader.py           | 53 +++++++++++++++++++
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 backend/modules/tools/url_reader.py

diff --git a/backend/modules/brain/integrations/GPT4/Brain.py b/backend/modules/brain/integrations/GPT4/Brain.py
index 962c85312..d57b8f55b 100644
--- a/backend/modules/brain/integrations/GPT4/Brain.py
+++ b/backend/modules/brain/integrations/GPT4/Brain.py
@@ -15,7 +15,7 @@ from modules.brain.knowledge_brain_qa import KnowledgeBrainQA
 from modules.chat.dto.chats import ChatQuestion
 from modules.chat.dto.outputs import GetChatHistoryOutput
 from modules.chat.service.chat_service import ChatService
-from modules.tools import ImageGeneratorTool, WebSearchTool
+from modules.tools import ImageGeneratorTool, URLReaderTool, WebSearchTool
 
 
 class AgentState(TypedDict):
@@ -37,7 +37,7 @@ class GPT4Brain(KnowledgeBrainQA):
         KnowledgeBrainQA (_type_): A brain that store the knowledge internaly
     """
 
-    tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool()]
+    tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool(), URLReaderTool()]
     tool_executor: ToolExecutor = ToolExecutor(tools)
     model_function: ChatOpenAI = None
 
diff --git a/backend/modules/tools/__init__.py b/backend/modules/tools/__init__.py
index 2fb21345c..476e2f20f 100644
--- a/backend/modules/tools/__init__.py
+++ b/backend/modules/tools/__init__.py
@@ -1,3 +1,4 @@
 from .image_generator import ImageGeneratorTool
 from .web_search import WebSearchTool
+from .url_reader import URLReaderTool
 
diff --git a/backend/modules/tools/url_reader.py b/backend/modules/tools/url_reader.py
new file mode 100644
index 000000000..1eb6b2f8c
--- /dev/null
+++ b/backend/modules/tools/url_reader.py
@@ -0,0 +1,53 @@
+# Tool that reads the text content of a single URL using Playwright.
+import os
+from typing import Dict, Optional, Type
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForToolRun,
+    CallbackManagerForToolRun,
+)
+from langchain.pydantic_v1 import BaseModel as BaseModelV1
+from langchain.pydantic_v1 import Field as FieldV1
+from langchain_community.document_loaders import PlaywrightURLLoader
+from langchain_core.tools import BaseTool
+from logger import get_logger
+from pydantic import BaseModel
+
+logger = get_logger(__name__)
+
+
+class URLReaderInput(BaseModelV1):  # argument schema: the single URL to read
+    url: str = FieldV1(default=..., description="url to read", title="url")
+
+
+class URLReaderTool(BaseTool):
+    """Tool that loads one URL with Playwright and returns its text content."""
+
+    name = "url-reader"
+    description = "useful for when you need to read the content of a url."
+    args_schema: Type[BaseModelV1] = URLReaderInput  # schema is a pydantic-v1 model
+    # NOTE(review): not read by this class; kept so the attribute stays available.
+    api_key = os.getenv("BRAVE_SEARCH_API_KEY")
+
+    @staticmethod
+    def _loader(url: str) -> PlaywrightURLLoader:
+        """Build the loader for *url*, dropping header and footer elements."""
+        return PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
+
+    @staticmethod
+    def _combine(pages) -> Dict:
+        """Join the text of every loaded page into the tool's result dict."""
+        return {"content": "".join(page.page_content for page in pages)}
+
+    def _run(
+        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
+    ) -> Dict:
+        """Read *url* synchronously and return ``{"content": <page text>}``."""
+        return self._combine(self._loader(url).load())
+
+    async def _arun(
+        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
+    ) -> Dict:
+        """Read *url* asynchronously and return ``{"content": <page text>}``."""
+        # aload() drives Playwright's async API; sync load() cannot run in the loop.
+        return self._combine(await self._loader(url).aload())