quivr/backend/modules/tools/url_reader.py
Stan Girard 105a2b8ecc
feat(tool): Add URLReaderTool (#2577)
This pull request adds the URLReaderTool to the list of tools in the
GPT4Brain class. The URLReaderTool allows for reading the content of a
URL.
2024-05-10 05:03:21 -07:00

54 lines
1.6 KiB
Python

# Read a URL with Playwright and combine the text content of the loaded pages
import os
from typing import Dict, Optional, Type
from langchain.callbacks.manager import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain.pydantic_v1 import BaseModel as BaseModelV1
from langchain.pydantic_v1 import Field as FieldV1
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_core.tools import BaseTool
from logger import get_logger
from pydantic import BaseModel
# Module-level logger obtained from the project's `logger` helper, keyed by
# this module's name.
# NOTE(review): `logger` is not referenced by the code visible in this file.
logger = get_logger(__name__)
class URLReaderInput(BaseModelV1):
    # Argument schema for the URL reader tool (used as its `args_schema`).
    # It declares a single string field, `url`; the `...` default marks the
    # field as required.
    url: str = FieldV1(
        ...,
        title="url",
        description="url to read",
    )
class URLReaderTool(BaseTool):
    """Tool that loads a web page with Playwright and returns its text.

    The URL is fetched through ``PlaywrightURLLoader`` with the ``header`` and
    ``footer`` selectors removed, and the ``page_content`` of every document
    the loader returns is concatenated into one string.
    """

    name: str = "url-reader"
    description: str = "useful for when you need to read the content of a url."
    # The schema class derives from the pydantic-v1 base model, so the
    # annotation uses that same base rather than `pydantic.BaseModel`.
    args_schema: Type[BaseModelV1] = URLReaderInput
    # NOTE(review): this value is not read anywhere in this class; it looks
    # like a leftover from a search tool. Kept (and annotated, since
    # os.getenv may return None) so existing attribute access keeps working.
    api_key: Optional[str] = os.getenv("BRAVE_SEARCH_API_KEY")

    @staticmethod
    def _make_loader(url: str) -> PlaywrightURLLoader:
        """Build the Playwright loader for a single URL."""
        return PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])

    @staticmethod
    def _combine(documents) -> Dict[str, str]:
        """Join the ``page_content`` of each loaded document into the result dict."""
        return {"content": "".join(doc.page_content for doc in documents)}

    def _run(
        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> Dict:
        """Run the tool synchronously.

        Args:
            url: Address of the page to read.
            run_manager: Optional callback manager supplied by the tool
                runner; it is not used here.

        Returns:
            A dict with a single ``"content"`` key holding the combined text
            of all loaded documents (an empty string if none were loaded).
        """
        documents = self._make_loader(url).load()
        return self._combine(documents)

    async def _arun(
        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> Dict:
        """Run the tool asynchronously.

        Args:
            url: Address of the page to read.
            run_manager: Optional async callback manager supplied by the tool
                runner; it is not used here.

        Returns:
            A dict with a single ``"content"`` key holding the combined text
            of all loaded documents (an empty string if none were loaded).
        """
        # Use the loader's async API: calling the synchronous load() from a
        # coroutine would block the event loop while the page is fetched.
        documents = await self._make_loader(url).aload()
        return self._combine(documents)