import os
import re
import tempfile
import unicodedata
from typing import Optional, Tuple

import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
    """Fetch a single web page and store its HTML in a temporary file."""

    # Address of the page to download.
    url: str
    # NOTE(review): `js`, `depth` and `max_pages` are declared but not read by
    # any method in this class; their intended use is not visible here.
    js: bool = False
    depth: int = 1
    max_pages: int = 100
    # Used below as the HTTP request timeout, in seconds.
    # NOTE(review): confirm this matches the field's intended meaning.
    max_time: int = 60

    def _crawl(self, url: str) -> Optional[str]:
        """Download *url* and return the response body.

        Args:
            url: Address to request with HTTP GET.

        Returns:
            The response text when the status code is 200, otherwise ``None``.

        Raises:
            requests.RequestException: Propagated unchanged on connection
                failures or when the timeout expires.
        """
        # Without a timeout, requests may block forever on an unresponsive host.
        response = requests.get(url, timeout=self.max_time)
        if response.status_code == 200:
            return response.text
        return None

    def process(self) -> Optional[Tuple[str, str]]:
        """Fetch ``self.url`` and save the HTML to the system temp directory.

        Returns:
            A ``(temp_file_path, file_name)`` tuple when content was fetched,
            or ``None`` when the request did not succeed or the body was empty.
        """
        content = self._crawl(self.url)

        # Bail out before touching the disk: writing ``None`` would raise
        # TypeError and leave a stray empty file behind.
        if not content:
            return None

        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

        # Explicit UTF-8 so pages with non-ASCII characters can be written
        # regardless of the platform's default encoding.
        with open(temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.write(content)

        return temp_file_path, file_name

    def checkGithub(self) -> bool:
        """Return ``True`` when ``self.url`` contains the text ``github.com``."""
        return "github.com" in self.url


def slugify(text: str) -> str:
    """Convert *text* into a lowercase, hyphen-separated ASCII slug.

    The text is NFKD-normalized and characters that cannot be encoded as ASCII
    are dropped. Anything that is not a word character, whitespace or a hyphen
    is then removed, and each run of whitespace and hyphens becomes a single
    hyphen.

    Args:
        text: Arbitrary input string, e.g. a URL.

    Returns:
        The slugified string (may be empty).
    """
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text