2024-04-02 04:40:56 +03:00
|
|
|
from bs4 import BeautifulSoup as Soup
|
|
|
|
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
|
|
|
from logger import get_logger
|
2024-04-11 19:31:16 +03:00
|
|
|
from modules.assistant.dto.outputs import (
|
|
|
|
AssistantOutput,
|
|
|
|
Inputs,
|
|
|
|
InputUrl,
|
|
|
|
OutputBrain,
|
|
|
|
OutputEmail,
|
|
|
|
Outputs,
|
|
|
|
)
|
2024-04-10 14:28:22 +03:00
|
|
|
from modules.assistant.ito.ito import ITO
|
2024-04-02 04:40:56 +03:00
|
|
|
|
|
|
|
# Module-level logger, obtained from the project's ``get_logger`` helper using this module's ``__name__``.
logger = get_logger(__name__)
|
|
|
|
|
|
|
|
|
2024-04-10 14:28:22 +03:00
|
|
|
class CrawlerAssistant(ITO):
    """Assistant that crawls a website and uploads the text of each page.

    ``self.url`` and ``create_and_upload_processed_file`` are not defined in
    this class; presumably both are provided by ``ITO`` -- confirm against the
    base class.
    """

    def __init__(
        self,
        **kwargs,
    ):
        """Forward every keyword argument unchanged to the ``ITO`` initializer."""
        super().__init__(
            **kwargs,
        )

    async def process_assistant(self):
        """Crawl ``self.url`` and upload the extracted text of every page.

        Pages are loaded with ``RecursiveUrlLoader`` (``max_depth=2``) and each
        one is reduced to plain text with BeautifulSoup's ``html.parser``.
        Every loaded document is then handed to
        ``create_and_upload_processed_file`` under a file name derived from
        the start URL, tagged ``"Crawler"``.
        """
        url = self.url
        loader = RecursiveUrlLoader(
            url=url,
            max_depth=2,
            extractor=lambda html: Soup(html, "html.parser").text,
        )
        # NOTE(review): ``load()`` is called synchronously inside a coroutine;
        # it may block the event loop while crawling -- confirm this is acceptable.
        docs = loader.load()

        # Strip the scheme only when one is present. ``partition`` never raises
        # on a scheme-less URL (the previous ``split("://")[1]`` did) and keeps
        # everything after the first "://" even if the URL contains another one.
        _, separator, remainder = url.partition("://")
        location = remainder if separator else url
        nice_url = location.replace("/", "_").replace(".", "_") + ".txt"

        # NOTE(review): every page is uploaded under the same file name;
        # confirm how ``create_and_upload_processed_file`` treats duplicates.
        for doc in docs:  # distinct name: do not rebind ``docs`` while iterating it
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )
|
2024-04-11 19:31:16 +03:00
|
|
|
|
|
|
|
|
|
|
|
def crawler_inputs():
    """Build the ``AssistantOutput`` that describes the Crawler assistant.

    The description declares a single required URL input (key ``"url"``) and
    two required outputs: a brain of type ``"uuid"`` and an email of type
    ``"str"``.

    Returns:
        The populated ``AssistantOutput`` instance.
    """
    # What the assistant consumes: exactly one URL.
    declared_inputs = Inputs(
        urls=[
            InputUrl(
                key="url",
                required=True,
                description="The URL to crawl",
            )
        ],
    )

    # Where the assistant delivers its result.
    declared_outputs = Outputs(
        brain=OutputBrain(
            required=True,
            description="The brain to which upload the document",
            type="uuid",
        ),
        email=OutputEmail(
            required=True,
            description="Send the document by email",
            type="str",
        ),
    )

    return AssistantOutput(
        name="Crawler",
        description="Crawls a website and extracts the text from the pages",
        tags=["new"],
        input_description="One URL to crawl",
        output_description="Text extracted from the pages",
        inputs=declared_inputs,
        outputs=declared_outputs,
    )
|