quivr/backend/crawl/crawler.py

import os
import re
import tempfile
import unicodedata

import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
    """Fetch a single web page and save it as an HTML file in a temp directory."""

    url: str
    js: bool = False
    depth: int = 1
    max_pages: int = 100
    max_time: int = 60

    def _crawl(self, url):
        # Fetch the raw page content; return None on any non-200 response
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None

    def process(self):
        # Crawl the page and, if anything was retrieved, persist it to a temp file
        content = self._crawl(self.url)
        if not content:
            return None

        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(content)

        return temp_file_path, file_name

    def checkGithub(self):
        # True when the target URL points at a GitHub repository
        return "github.com" in self.url


def slugify(text):
    """Turn arbitrary text (here, a URL) into a filesystem-safe slug."""
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text
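

# Illustrative usage sketch (not part of the original module): shows how the class
# might be exercised directly. The example URL, the __main__ guard, and the decision
# to call process() here are assumptions for illustration, not taken from this file.
if __name__ == "__main__":
    crawler = CrawlWebsite(url="https://example.com")
    if crawler.checkGithub():
        # GitHub URLs are presumably handed to a Git loader elsewhere in the backend
        print("GitHub repository detected; skipping plain-page crawl")
    else:
        result = crawler.process()
        if result:
            temp_file_path, file_name = result
            print(f"Saved crawled page to {temp_file_path} ({file_name})")
        else:
            print("Crawl failed: no content retrieved")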