From 105a2b8eccd64c24dfe2278d365fd8bd1f34d268 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Fri, 10 May 2024 14:03:21 +0200 Subject: [PATCH] feat(tool): Add URLReaderTool (#2577) This pull request adds the URLReaderTool to the list of tools in the GPT4Brain class. The URLReaderTool allows for reading the content of a URL. --- .../modules/brain/integrations/GPT4/Brain.py | 4 +- backend/modules/tools/__init__.py | 1 + backend/modules/tools/url_reader.py | 53 +++++++++++++++++++ 3 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 backend/modules/tools/url_reader.py diff --git a/backend/modules/brain/integrations/GPT4/Brain.py b/backend/modules/brain/integrations/GPT4/Brain.py index 962c85312..d57b8f55b 100644 --- a/backend/modules/brain/integrations/GPT4/Brain.py +++ b/backend/modules/brain/integrations/GPT4/Brain.py @@ -15,7 +15,7 @@ from modules.brain.knowledge_brain_qa import KnowledgeBrainQA from modules.chat.dto.chats import ChatQuestion from modules.chat.dto.outputs import GetChatHistoryOutput from modules.chat.service.chat_service import ChatService -from modules.tools import ImageGeneratorTool, WebSearchTool +from modules.tools import ImageGeneratorTool, URLReaderTool, WebSearchTool class AgentState(TypedDict): @@ -37,7 +37,7 @@ class GPT4Brain(KnowledgeBrainQA): KnowledgeBrainQA (_type_): A brain that store the knowledge internaly """ - tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool()] + tools: List[BaseTool] = [WebSearchTool(), ImageGeneratorTool(), URLReaderTool()] tool_executor: ToolExecutor = ToolExecutor(tools) model_function: ChatOpenAI = None diff --git a/backend/modules/tools/__init__.py b/backend/modules/tools/__init__.py index 2fb21345c..476e2f20f 100644 --- a/backend/modules/tools/__init__.py +++ b/backend/modules/tools/__init__.py @@ -1,3 +1,4 @@ from .image_generator import ImageGeneratorTool from .web_search import WebSearchTool +from .url_reader import URLReaderTool diff --git 
a/backend/modules/tools/url_reader.py b/backend/modules/tools/url_reader.py
new file mode 100644
index 000000000..1eb6b2f8c
--- /dev/null
+++ b/backend/modules/tools/url_reader.py
@@ -0,0 +1,80 @@
+"""Tool that loads a web page with Playwright and returns its text content."""
+
+import os
+from typing import Dict, Iterable, Optional, Type
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForToolRun,
+    CallbackManagerForToolRun,
+)
+from langchain.pydantic_v1 import BaseModel as BaseModelV1
+from langchain.pydantic_v1 import Field as FieldV1
+from langchain_community.document_loaders import PlaywrightURLLoader
+from langchain_core.documents import Document
+from langchain_core.tools import BaseTool
+from logger import get_logger
+from pydantic import BaseModel
+
+logger = get_logger(__name__)
+
+# Page regions stripped from the DOM before the text is extracted.
+_REMOVE_SELECTORS = ("header", "footer")
+
+
+def _build_loader(url: str) -> PlaywrightURLLoader:
+    """Return a Playwright loader for ``url`` that drops the header and footer."""
+    return PlaywrightURLLoader(urls=[url], remove_selectors=list(_REMOVE_SELECTORS))
+
+
+def _to_result(pages: Iterable[Document]) -> Dict[str, str]:
+    """Concatenate the text of ``pages`` into the ``{"content": ...}`` payload."""
+    return {"content": "".join(page.page_content for page in pages)}
+
+
+class URLReaderInput(BaseModelV1):
+    """Arguments accepted by ``URLReaderTool``."""
+
+    url: str = FieldV1(..., title="url", description="url to read")
+
+
+class URLReaderTool(BaseTool):
+    """Tool that returns the text content of a single URL."""
+
+    name: str = "url-reader"
+    description: str = "useful for when you need to read the content of a url."
+    args_schema: Type[BaseModel] = URLReaderInput
+    # NOTE(review): nothing in this tool reads ``api_key``; it looks copied from
+    # the web search tool. Kept so the attribute still exists, and annotated as
+    # Optional because the environment variable may be unset.
+    api_key: Optional[str] = os.getenv("BRAVE_SEARCH_API_KEY")
+
+    def _run(
+        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
+    ) -> Dict[str, str]:
+        """Load ``url`` synchronously.
+
+        Args:
+            url: Address of the page to read.
+            run_manager: Callback manager supplied by the caller; unused here.
+
+        Returns:
+            ``{"content": text}`` with the text of every loaded page joined
+            together; the text is empty when the loader returns no page.
+        """
+        return _to_result(_build_loader(url).load())
+
+    async def _arun(
+        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
+    ) -> Dict[str, str]:
+        """Load ``url`` asynchronously.
+
+        Args:
+            url: Address of the page to read.
+            run_manager: Callback manager supplied by the caller; unused here.
+
+        Returns:
+            The same ``{"content": text}`` payload as ``_run``.
+        """
+        # Await the loader's async API: ``load()`` drives Playwright's sync API,
+        # which blocks the event loop and is rejected inside a running loop.
+        return _to_result(await _build_loader(url).aload())