# mirror of https://github.com/StanGirard/quivr.git
# synced 2024-12-21 02:11:35 +03:00
import os
import re
import tempfile
import unicodedata

import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
    """Fetch a single web page and persist its HTML to a temporary file."""

    url: str                # page to fetch
    js: bool = False        # JS rendering requested — not acted on in this block
    depth: int = 1          # crawl depth — not acted on in this block
    max_pages: int = 100    # page budget — not acted on in this block
    max_time: int = 60      # request timeout budget, in seconds

    def _crawl(self, url):
        """GET *url* and return the body text on HTTP 200, else None.

        Uses ``max_time`` as the request timeout so a stalled server
        cannot hang the crawl indefinitely (the original call had no
        timeout at all).
        """
        response = requests.get(url, timeout=self.max_time)
        if response.status_code == 200:
            return response.text
        return None

    def process(self):
        """Crawl ``self.url`` and write the HTML to a temp file.

        Returns:
            tuple[str, str]: ``(temp_file_path, file_name)`` on success.
            None: when the crawl failed or returned an empty body.
        """
        content = self._crawl(self.url)

        # Bail out BEFORE touching the filesystem. The original wrote
        # `content` first and only checked it afterwards, so a failed
        # crawl (None) crashed with TypeError inside temp_file.write().
        if not content:
            return None

        ## Create a file
        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        # Explicit encoding: the platform default is locale-dependent and
        # can fail on non-ASCII page content.
        with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
            temp_file.write(content)

        return temp_file_path, file_name

    def checkGithub(self):
        """Return True when the URL points at github.com."""
        # Name kept camelCase for backward compatibility with callers.
        return "github.com" in self.url
|
|
|
|
|
|
|
|
def slugify(text):
    """Turn *text* into a filesystem-safe, lowercase, hyphenated slug.

    Pipeline: fold accented characters to their ASCII base (NFKD),
    drop everything that is not a word character, whitespace, or a
    hyphen, then collapse runs of whitespace/hyphens into single '-'.
    """
    ascii_form = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )
    kept = re.sub(r'[^\w\s-]', '', ascii_form).strip().lower()
    return re.sub(r'[-\s]+', '-', kept)