import os
import re
import tempfile
import unicodedata
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
    url: str
    js: bool = False
    depth: int = int(os.getenv("CRAWL_DEPTH", "1"))
    max_pages: int = 100
    max_time: int = 60

    def _crawl(self, url):
        """Fetch the raw HTML of a page, returning None on any failure."""
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except Exception as e:
            # A failed fetch should not abort the whole crawl; the caller
            # treats None as "no links to follow from this page".
            print(e)
            return None

    def extract_content(self, url):
        """Extract the readable article text from a page with newspaper."""
        article = Article(url)
        try:
            article.download()
            article.parse()
        except Exception as e:
            print(f'Error downloading or parsing article: {e}')
            return None
        return article.text

    def _process_recursive(self, url, depth, visited_urls):
        """Depth-first crawl that concatenates article text from same-domain links."""
        if depth == 0 or url in visited_urls:
            return ""
        visited_urls.add(url)

        # extract_content may return None on failure; fall back to an empty
        # string so the concatenation below never raises a TypeError.
        content = self.extract_content(url) or ""

        raw_html = self._crawl(url)
        if not raw_html:
            return content

        soup = BeautifulSoup(raw_html, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        for link in links:
            full_url = urljoin(url, link)
            # Ensure we're staying on the same domain
            if self.url in full_url:
                content += self._process_recursive(full_url, depth - 1, visited_urls)
        return content

    def process(self):
        # Extract and combine content recursively
        visited_urls = set()
        extracted_content = self._process_recursive(self.url, self.depth, visited_urls)

        # Write the combined text to a temporary file named after the slugified URL
        file_name = slugify(self.url) + ".txt"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(extracted_content)
        return temp_file_path, file_name

    def checkGithub(self):
        return 'github.com' in self.url


def slugify(text):
    """Normalize a URL or title into a safe ASCII file-name slug."""
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text
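

# Minimal usage sketch (an assumption, not part of the original module): the
# URL below is only an illustrative placeholder showing how process() returns
# the path and name of the temporary file holding the crawled text.
if __name__ == "__main__":
    crawler = CrawlWebsite(url="https://example.com", depth=1)
    file_path, file_name = crawler.process()
    print(f"Crawled content written to {file_path} ({file_name})")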