quivr/backend/modules/assistant/ito/crawler.py

from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from logger import get_logger
from modules.assistant.dto.outputs import (
    AssistantOutput,
    Inputs,
    InputUrl,
    OutputBrain,
    OutputEmail,
    Outputs,
)
from modules.assistant.ito.ito import ITO

logger = get_logger(__name__)


class CrawlerAssistant(ITO):
    """Assistant that crawls a website and uploads the text of its pages.

    Starting from ``self.url``, pages are fetched with ``RecursiveUrlLoader``
    (``max_depth=2``), reduced to plain text with BeautifulSoup's
    ``html.parser``, and each resulting document is passed to
    ``self.create_and_upload_processed_file``.
    """

    def __init__(
        self,
        **kwargs,
    ):
        """Forward every keyword argument unchanged to the ``ITO`` base class."""
        super().__init__(
            **kwargs,
        )

    async def process_assistant(self):
        """Crawl ``self.url`` and upload the extracted text of each loaded page.

        The upload file name is derived from the start URL: the scheme (if
        any) is dropped, ``/`` and ``.`` become ``_``, and ``.txt`` is
        appended. A URL without ``://`` is used as-is for the name instead of
        raising ``IndexError``, and a URL containing a second ``://`` (for
        example inside a query string) is no longer truncated at it.
        """
        url = self.url
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
        )
        # NOTE(review): load() is a plain, non-awaited call inside a coroutine;
        # it presumably blocks the event loop for the duration of the crawl.
        # Consider offloading it to a worker thread -- confirm with callers.
        docs = loader.load()

        # Split on the FIRST "://" only; fall back to the whole URL when no
        # scheme separator is present.
        _, separator, remainder = url.partition("://")
        stem = remainder if separator else url
        nice_url = stem.replace("/", "_").replace(".", "_") + ".txt"

        # NOTE(review): every page is uploaded under the same file name;
        # verify that create_and_upload_processed_file tolerates repeats.
        # Use a distinct loop variable so the `docs` list is not rebound.
        for doc in docs:
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )


def crawler_inputs():
    """Build the descriptor for the "Crawler" assistant.

    Returns:
        An ``AssistantOutput`` declaring one required URL input (key
        ``"url"``) and two required outputs: a brain of type ``"uuid"`` and
        an email of type ``"str"``.
    """
    # The single URL input the assistant asks for.
    url_field = InputUrl(
        key="url",
        required=True,
        description="The URL to crawl",
    )
    accepted_inputs = Inputs(urls=[url_field])

    # Destinations declared for the extracted text.
    brain_target = OutputBrain(
        required=True,
        description="The brain to which upload the document",
        type="uuid",
    )
    email_target = OutputEmail(
        required=True,
        description="Send the document by email",
        type="str",
    )
    declared_outputs = Outputs(brain=brain_target, email=email_target)

    return AssistantOutput(
        name="Crawler",
        description="Crawls a website and extracts the text from the pages",
        tags=["new"],
        input_description="One URL to crawl",
        output_description="Text extracted from the pages",
        inputs=accepted_inputs,
        outputs=declared_outputs,
    )
feat(ingestion): Add ingestion module and routes (#2393) This pull request adds the ingestion module and routes to the project. It includes the necessary files and code changes to implement the ingestion functionality. 2024-04-02 04:40:56 +03:00			`from bs4 import BeautifulSoup as Soup`
			`from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader`
			`from logger import get_logger`
feat: assistants (#2421) …dio_transcript and crawler assistants # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-11 19:31:16 +03:00			`from modules.assistant.dto.outputs import (`
			`AssistantOutput,`
			`Inputs,`
			`InputUrl,`
			`OutputBrain,`
			`OutputEmail,`
			`Outputs,`
			`)`
feat: Add assistant module and remove ingestion module (#2420) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-10 14:28:22 +03:00			`from modules.assistant.ito.ito import ITO`
feat(ingestion): Add ingestion module and routes (#2393) This pull request adds the ingestion module and routes to the project. It includes the necessary files and code changes to implement the ingestion functionality. 2024-04-02 04:40:56 +03:00
			`logger = get_logger(__name__)`


feat: Add assistant module and remove ingestion module (#2420) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-10 14:28:22 +03:00			`class CrawlerAssistant(ITO):`
feat(ingestion): Add ingestion module and routes (#2393) This pull request adds the ingestion module and routes to the project. It includes the necessary files and code changes to implement the ingestion functionality. 2024-04-02 04:40:56 +03:00
			`def __init__(`
			`self,`
			`**kwargs,`
			`):`
			`super().__init__(`
			`**kwargs,`
			`)`

feat: Add assistant module and remove ingestion module (#2420) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-10 14:28:22 +03:00			`async def process_assistant(self):`
feat(ingestion): Add ingestion module and routes (#2393) This pull request adds the ingestion module and routes to the project. It includes the necessary files and code changes to implement the ingestion functionality. 2024-04-02 04:40:56 +03:00
			`url = self.url`
			`loader = RecursiveUrlLoader(`
			`url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text`
			`)`
			`docs = loader.load()`

			`nice_url = url.split("://")[1].replace("/", "_").replace(".", "_")`
			`nice_url += ".txt"`

			`for docs in docs:`
			`await self.create_and_upload_processed_file(`
			`docs.page_content, nice_url, "Crawler"`
			`)`
feat: assistants (#2421) …dio_transcript and crawler assistants # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-11 19:31:16 +03:00

			`def crawler_inputs():`
			`output = AssistantOutput(`
			`name="Crawler",`
			`description="Crawls a website and extracts the text from the pages",`
feat: Add tags to AssistantOutput classes (#2425) This pull request adds a new optional field "tags" to the AssistantOutput classes. The "tags" field allows for categorizing the outputs and can be used for filtering or organizing purposes. 2024-04-11 19:42:01 +03:00			`tags=["new"],`
feat: assistants (#2421) …dio_transcript and crawler assistants # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-04-11 19:31:16 +03:00			`input_description="One URL to crawl",`
			`output_description="Text extracted from the pages",`
			`inputs=Inputs(`
			`urls=[`
			`InputUrl(`
			`key="url",`
			`required=True,`
			`description="The URL to crawl",`
			`)`
			`],`
			`),`
			`outputs=Outputs(`
			`brain=OutputBrain(`
			`required=True,`
			`description="The brain to which upload the document",`
			`type="uuid",`
			`),`
			`email=OutputEmail(`
			`required=True,`
			`description="Send the document by email",`
			`type="str",`
			`),`
			`),`
			`)`
			`return output`