quivr/backend/modules/assistant/ito/crawler.py
Stan Girard 9db8b63bb9
feat: Add tags to AssistantOutput classes (#2425)
This pull request adds a new optional field "tags" to the
AssistantOutput classes. The "tags" field allows for categorizing the
outputs and can be used for filtering or organizing purposes.
2024-04-11 09:42:01 -07:00

74 lines
1.9 KiB
Python

from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from logger import get_logger
from modules.assistant.dto.outputs import (
AssistantOutput,
Inputs,
InputUrl,
OutputBrain,
OutputEmail,
Outputs,
)
from modules.assistant.ito.ito import ITO
# Module-level logger, obtained from the project's `logger` helper and keyed
# by this module's name.
logger = get_logger(__name__)
class CrawlerAssistant(ITO):
    """Assistant that crawls a website and uploads the text of its pages.

    All construction arguments are forwarded untouched to the ``ITO`` base
    class. ``process_assistant`` reads ``self.url``, which is presumably set
    by ``ITO`` from those arguments (TODO confirm against the base class).
    """

    def __init__(
        self,
        **kwargs,
    ):
        """Forward every keyword argument unchanged to ``ITO.__init__``."""
        super().__init__(
            **kwargs,
        )

    async def process_assistant(self):
        """Crawl ``self.url`` and upload the extracted text of each page.

        The crawl follows links up to a depth of 2 and reduces every fetched
        page to its plain text using BeautifulSoup's ``html.parser``. Each
        resulting document is then passed to
        ``create_and_upload_processed_file`` with the label ``"Crawler"``.

        The upload file name is derived from the URL: the scheme (if any) is
        dropped, ``/`` and ``.`` are replaced by ``_`` and ``.txt`` is
        appended. A URL without a scheme is accepted and used as-is for the
        file name instead of raising ``IndexError``.
        """
        url = self.url
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
        )
        # NOTE(review): load() is called synchronously inside this coroutine,
        # so the crawl runs to completion before any upload starts.
        docs = loader.load()

        # Split only on the first "://" and take the last piece: this keeps
        # the whole remainder when "://" reappears later in the URL (e.g. in
        # a query string) and tolerates URLs that carry no scheme at all.
        nice_url = url.split("://", 1)[-1].replace("/", "_").replace(".", "_")
        nice_url += ".txt"

        # NOTE(review): every crawled page is uploaded under the same file
        # name; whether later uploads replace earlier ones depends on
        # create_and_upload_processed_file — confirm this is intended.
        for doc in docs:
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )
def crawler_inputs():
    """Build the ``AssistantOutput`` descriptor for the Crawler assistant.

    The descriptor declares one required URL input (key ``"url"``) and two
    required outputs: a brain target of type ``"uuid"`` and an email target
    of type ``"str"``. It is tagged ``"new"``.

    Returns:
        The populated ``AssistantOutput`` instance.
    """
    # Single URL field the caller must supply.
    url_field = InputUrl(
        key="url",
        required=True,
        description="The URL to crawl",
    )
    crawler_in = Inputs(urls=[url_field])

    # Destinations for the crawled text.
    brain_target = OutputBrain(
        required=True,
        description="The brain to which upload the document",
        type="uuid",
    )
    email_target = OutputEmail(
        required=True,
        description="Send the document by email",
        type="str",
    )
    crawler_out = Outputs(brain=brain_target, email=email_target)

    return AssistantOutput(
        name="Crawler",
        description="Crawls a website and extracts the text from the pages",
        tags=["new"],
        input_description="One URL to crawl",
        output_description="Text extracted from the pages",
        inputs=crawler_in,
        outputs=crawler_out,
    )