Mirror of https://github.com/QuivrHQ/quivr.git (synced 2024-12-22 05:41:39 +03:00)
Commit 9db8b63bb9
This pull request adds a new optional field, "tags", to the AssistantOutput class. The "tags" field allows categorizing outputs and can be used for filtering or organizing them.
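The DTO change itself is not shown in this file. A minimal sketch of what it likely looks like, assuming the output DTOs in modules.assistant.dto.outputs are Pydantic models (the exact field type and default here are assumptions):

from typing import List, Optional

from pydantic import BaseModel


class AssistantOutput(BaseModel):
    name: str
    description: str
    # Assumed shape of the new field: an optional list of labels such as
    # ["new"], usable for filtering or organizing assistants.
    tags: Optional[List[str]] = None
    # ...remaining fields (input_description, inputs, outputs, ...) omitted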
74 lines · 1.9 KiB · Python
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

from logger import get_logger
from modules.assistant.dto.outputs import (
    AssistantOutput,
    Inputs,
    InputUrl,
    OutputBrain,
    OutputEmail,
    Outputs,
)
from modules.assistant.ito.ito import ITO

logger = get_logger(__name__)


class CrawlerAssistant(ITO):
    """Assistant that crawls a website and uploads the extracted text."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    async def process_assistant(self):
        url = self.url
        # Crawl the URL recursively up to two levels deep, stripping HTML
        # tags from each page with BeautifulSoup.
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
        )
        docs = loader.load()

        # Derive a filesystem-friendly file name from the URL, e.g.
        # "https://example.com/docs" -> "example_com_docs.txt".
        nice_url = url.split("://")[1].replace("/", "_").replace(".", "_")
        nice_url += ".txt"

        # Use a distinct loop variable; the original `for docs in docs`
        # shadowed the document list it was iterating over.
        for doc in docs:
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )


def crawler_inputs():
    output = AssistantOutput(
        name="Crawler",
        description="Crawls a website and extracts the text from the pages",
        # New in this PR: optional tags for filtering/organizing assistants.
        tags=["new"],
        input_description="One URL to crawl",
        output_description="Text extracted from the pages",
        inputs=Inputs(
            urls=[
                InputUrl(
                    key="url",
                    required=True,
                    description="The URL to crawl",
                )
            ],
        ),
        outputs=Outputs(
            brain=OutputBrain(
                required=True,
                description="The brain to which upload the document",
                type="uuid",
            ),
            email=OutputEmail(
                required=True,
                description="Send the document by email",
                type="str",
            ),
        ),
    )
    return output
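For illustration, here is one way the new "tags" field could be used for the filtering the PR description mentions. This is a hypothetical sketch: only crawler_inputs comes from the repository; the outputs list and the has_tag helper are invented for the example.

# Hypothetical usage sketch: filter assistant outputs by tag.
outputs = [crawler_inputs()]


def has_tag(output, tag: str) -> bool:
    # `tags` is optional, so guard against None before the membership test.
    return bool(output.tags) and tag in output.tags


new_assistants = [o.name for o in outputs if has_tag(o, "new")]
print(new_assistants)  # ["Crawler"]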