mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-18 11:51:41 +03:00
105a2b8ecc
This pull request adds the URLReaderTool to the list of tools in the GPT4Brain class. The URLReaderTool allows for reading the content of a URL.
54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
# Tool that loads a single URL and returns the combined text of the loaded pages
|
|
import os
|
|
from typing import Dict, Optional, Type
|
|
|
|
from langchain.callbacks.manager import (
|
|
AsyncCallbackManagerForToolRun,
|
|
CallbackManagerForToolRun,
|
|
)
|
|
from langchain.pydantic_v1 import BaseModel as BaseModelV1
|
|
from langchain.pydantic_v1 import Field as FieldV1
|
|
from langchain_community.document_loaders import PlaywrightURLLoader
|
|
from langchain_core.tools import BaseTool
|
|
from logger import get_logger
|
|
from pydantic import BaseModel
|
|
|
|
# Module-level logger, created via the project's `get_logger` helper and
# named after this module.
logger = get_logger(__name__)
|
|
|
|
|
|
class URLReaderInput(BaseModelV1):
    # Argument schema for the URL reader tool: one required string field, `url`.
    # (A `#` comment is used rather than a docstring so the generated schema
    # is not altered.)
    url: str = FieldV1(
        ...,
        description="url to read",
        title="url",
    )
|
|
|
|
|
|
def _make_url_loader(url: str) -> PlaywrightURLLoader:
    """Build the Playwright loader for a single URL, dropping header/footer elements."""
    return PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])


def _join_page_contents(pages) -> Dict:
    """Concatenate the ``page_content`` of every loaded document into one result dict."""
    return {"content": "".join(page.page_content for page in pages)}


class URLReaderTool(BaseTool):
    """Tool that reads the content of a URL.

    The URL is loaded through ``PlaywrightURLLoader`` with ``header`` and
    ``footer`` selectors removed; the ``page_content`` of every returned
    document is concatenated and returned as ``{"content": <text>}``.
    """

    name = "url-reader"
    description = "useful for when you need to read the content of a url."
    args_schema: Type[BaseModel] = URLReaderInput
    # NOTE(review): this value is not read anywhere in this class; it looks
    # like a leftover from a search tool. Kept so the class attributes stay
    # unchanged for any external code that may reference it.
    api_key = os.getenv("BRAVE_SEARCH_API_KEY")

    def _run(
        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> Dict:
        """Run the tool synchronously.

        Args:
            url: Address of the page to read.
            run_manager: Optional callback manager; not used by this tool.

        Returns:
            A dict with a single ``"content"`` key holding the combined text
            of all loaded documents (empty string when nothing was loaded).
        """
        pages = _make_url_loader(url).load()
        return _join_page_contents(pages)

    async def _arun(
        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> Dict:
        """Run the tool asynchronously.

        Args:
            url: Address of the page to read.
            run_manager: Optional async callback manager; not used by this tool.

        Returns:
            A dict with a single ``"content"`` key holding the combined text
            of all loaded documents (empty string when nothing was loaded).
        """
        # Use the loader's async entry point: the synchronous ``load()`` drives
        # Playwright's sync API, which blocks the event loop and cannot be
        # started from inside a running asyncio loop.
        pages = await _make_url_loader(url).aload()
        return _join_page_contents(pages)
|