mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-19 20:31:50 +03:00
54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
|
# URL reader tool: load a web page with Playwright and return its combined text content
|
||
|
import os
|
||
|
from typing import Dict, Optional, Type
|
||
|
|
||
|
from langchain.callbacks.manager import (
|
||
|
AsyncCallbackManagerForToolRun,
|
||
|
CallbackManagerForToolRun,
|
||
|
)
|
||
|
from langchain.pydantic_v1 import BaseModel as BaseModelV1
|
||
|
from langchain.pydantic_v1 import Field as FieldV1
|
||
|
from langchain_community.document_loaders import PlaywrightURLLoader
|
||
|
from langchain_core.tools import BaseTool
|
||
|
from logger import get_logger
|
||
|
from pydantic import BaseModel
|
||
|
|
||
|
logger = get_logger(__name__)
|
||
|
|
||
|
|
||
|
class URLReaderInput(BaseModelV1):
    # Argument schema for the URL reader tool: a single required string field.
    url: str = FieldV1(
        ...,
        title="url",
        description="url to read",
    )
|
||
|
|
||
|
|
||
|
class URLReaderTool(BaseTool):
    """Tool that loads a URL and returns the combined text of the loaded pages.

    The URL is loaded with ``PlaywrightURLLoader`` with the ``header`` and
    ``footer`` selectors removed, and the ``page_content`` of every returned
    document is concatenated into a single string.
    """

    name: str = "url-reader"
    description: str = "useful for when you need to read the content of a url."
    args_schema: Type[BaseModel] = URLReaderInput
    # Not read by any method of this class; kept so existing attribute access
    # keeps working. Annotated explicitly so the declaration does not rely on
    # inferring a type from a value that is None when the variable is unset.
    api_key: Optional[str] = os.getenv("BRAVE_SEARCH_API_KEY")

    def _run(
        self, url: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> Dict:
        """Load ``url`` synchronously and return its text content.

        Args:
            url: Address of the page to load.
            run_manager: Accepted for interface compatibility; not used here.

        Returns:
            A dict with a single ``"content"`` key holding the concatenated
            ``page_content`` of all loaded documents (``""`` if none).
        """
        loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
        pages = loader.load()
        return {"content": "".join(page.page_content for page in pages)}

    async def _arun(
        self, url: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> Dict:
        """Load ``url`` asynchronously and return its text content.

        Args:
            url: Address of the page to load.
            run_manager: Accepted for interface compatibility; not used here.

        Returns:
            A dict with a single ``"content"`` key holding the concatenated
            ``page_content`` of all loaded documents (``""`` if none).
        """
        loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
        # Await the loader's async entry point: calling the synchronous
        # ``load()`` from a coroutine would block the event loop.
        pages = await loader.aload()
        return {"content": "".join(page.page_content for page in pages)}
|