quivr/backend/crawl/crawler.py

import os
import re
import tempfile
import unicodedata
from urllib.parse import urljoin
import requests
from pydantic import BaseModel
from newspaper import Article
from bs4 import BeautifulSoup


class CrawlWebsite(BaseModel):
    """Recursively crawl a website and collect the readable text of its pages."""

    url: str
    js: bool = False
    depth: int = int(os.getenv("CRAWL_DEPTH", "1"))
    max_pages: int = 100
    max_time: int = 60

    def _crawl(self, url):
        """Fetch the raw HTML of a page, or None if the request fails."""
        try:
            # requests has no default timeout, so bound each request to keep
            # a slow host from hanging the crawl.
            response = requests.get(url, timeout=self.max_time)
            if response.status_code == 200:
                return response.text
            return None
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_content(self, url):
        """Return the main article text of a page, or None if extraction fails."""
        article = Article(url)
        try:
            article.download()
            article.parse()
        except Exception as e:
            print(f'Error downloading or parsing article: {e}')
            return None
        return article.text

    def _process_recursive(self, url, depth, visited_urls):
        """Depth-first crawl that concatenates the text of every visited page."""
        # Stop when the depth budget is spent, the page was already visited,
        # or the overall page budget (max_pages) is exhausted.
        if depth == 0 or url in visited_urls or len(visited_urls) >= self.max_pages:
            return ""
        visited_urls.add(url)
        # extract_content returns None on failure; fall back to an empty
        # string so the concatenations below never raise a TypeError.
        content = self.extract_content(url) or ""
        raw_html = self._crawl(url)
        if not raw_html:
            return content
        soup = BeautifulSoup(raw_html, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        for link in links:
            full_url = urljoin(url, link)
            # Ensure we're staying on the same domain
            if self.url in full_url:
                content += self._process_recursive(full_url, depth - 1, visited_urls)
        return content

    def process(self):
        """Crawl the site and write the combined text to a temporary file."""
        # Extract and combine content recursively
        visited_urls = set()
        extracted_content = self._process_recursive(self.url, self.depth, visited_urls)
        # Write the result to a temp file named after the slugified URL.
        file_name = slugify(self.url) + ".txt"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
        with open(temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.write(extracted_content)
        return temp_file_path, file_name

    def checkGithub(self):
        return 'github.com' in self.url


def slugify(text):
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text
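

# A minimal usage sketch, assuming the module is run directly: the URL and
# depth below are illustrative and not part of the original module.
if __name__ == "__main__":
    crawler = CrawlWebsite(url="https://example.com", depth=1)
    temp_file_path, file_name = crawler.process()
    print(f"Crawled content written to {temp_file_path} ({file_name})")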