# quivr/backend/core/crawl/crawler.py
import os
import re
import tempfile
import unicodedata

import requests
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
    url: str
    js: bool = False
    depth: int = 1
    max_pages: int = 100
    max_time: int = 60

    def _crawl(self, url):
        # Fetch the page; return its body on HTTP 200, None otherwise
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None

    def process(self):
        content = self._crawl(self.url)
        # Bail out before touching the filesystem if the crawl failed;
        # the original wrote `content` first, which raises on None
        if content is None:
            return None

        # Write the crawled HTML to a temp file named after the slugified URL
        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(content)

        return temp_file_path, file_name

    def checkGithub(self):
        # True when the URL points at GitHub
        return "github.com" in self.url


def slugify(text):
    # Normalize to ASCII, strip punctuation, lowercase, and collapse
    # whitespace/hyphen runs into single hyphens
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text
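

# A minimal usage sketch (not part of the original module), assuming the class
# is driven directly: crawl a page and write it to a temporary file. The URL
# below is hypothetical and only illustrates the flow.
if __name__ == "__main__":
    crawler = CrawlWebsite(url="https://example.com")
    print(crawler.checkGithub())  # False: not a GitHub URL
    result = crawler.process()
    if result is not None:
        temp_file_path, file_name = result
        print(f"Saved {file_name} to {temp_file_path}")
    else:
        print(f"Failed to crawl {crawler.url}")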