# quivr/backend/crawl/crawler.py
# Snapshot: 2023-06-06 00:38:15 +02:00 (51 lines, 1.2 KiB, Python)
import os
import re
import tempfile
import unicodedata
import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel
class CrawlWebsite(BaseModel):
    """Fetch a single web page and persist its HTML to a temporary file.

    Attributes:
        url: The page to crawl.
        js: Whether JS rendering is desired (not used by this implementation).
        depth: Crawl depth budget (not used by this implementation).
        max_pages: Page-count budget (not used by this implementation).
        max_time: Request timeout in seconds for the HTTP fetch.
    """

    url: str
    js: bool = False
    depth: int = 1
    max_pages: int = 100
    max_time: int = 60

    def _crawl(self, url):
        """Fetch *url* and return the response body, or None on any failure.

        Returns None on non-200 status codes and on network errors
        (timeouts, DNS failures, connection resets) instead of raising,
        matching the "None means failure" contract process() relies on.
        """
        try:
            # Bug fix: the original had no timeout, so a stalled server
            # could hang this call forever. Reuse the max_time field.
            response = requests.get(url, timeout=self.max_time)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def process(self):
        """Crawl self.url and write the HTML to a temp file.

        Returns:
            (temp_file_path, file_name) on success, or None if the page
            could not be fetched.
        """
        content = self._crawl(self.url)
        # Bug fix: the original wrote `content` to the file BEFORE checking
        # it, so a failed crawl (content is None) raised TypeError on
        # temp_file.write(None). Guard first, write second.
        if not content:
            return None
        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        # Explicit encoding: HTML bodies are frequently non-ASCII and the
        # platform default encoding (e.g. cp1252 on Windows) can fail.
        with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
            temp_file.write(content)
        return temp_file_path, file_name

    def checkGithub(self):
        """Return True if the URL points at github.com."""
        # The `in` test already yields a bool; no if/else needed.
        return "github.com" in self.url
def slugify(text):
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
text = re.sub(r'[^\w\s-]', '', text).strip().lower()
text = re.sub(r'[-\s]+', '-', text)
return text