Mirror of https://github.com/QuivrHQ/quivr.git (synced 2024-12-14 17:03:29 +03:00)

feat (926): make crawl recursively navigate linked pages (#927)

Parent: b0e7c85316
Commit: d4d19bbf0b
@@ -3,6 +3,7 @@ import re
 import tempfile
 import unicodedata
 
+from urllib.parse import urljoin
 import requests
 from pydantic import BaseModel
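
For context on the import above (an illustration, not part of the diff): urljoin is what lets relative hrefs found on a page be resolved against the URL of the page they came from before the crawler recurses into them. The standard-library behaviour it relies on:

    from urllib.parse import urljoin

    # Relative links are resolved against the page they were found on.
    urljoin("https://example.com/docs/index.html", "guide.html")
    # -> "https://example.com/docs/guide.html"

    # Absolute links pass through unchanged.
    urljoin("https://example.com/docs/index.html", "https://other.org/page")
    # -> "https://other.org/page"
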
@@ -10,7 +11,7 @@ from pydantic import BaseModel
 class CrawlWebsite(BaseModel):
     url: str
     js: bool = False
-    depth: int = 1
+    depth: int = int(os.getenv("CRAWL_DEPTH","1"))
     max_pages: int = 100
     max_time: int = 60
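
With this hunk the crawl depth is read from the CRAWL_DEPTH environment variable and falls back to 1 when it is unset, so existing deployments keep the old single-page behaviour. Because the field default is evaluated when the class body executes, CRAWL_DEPTH has to be set before the module is imported. A minimal sketch of the fallback (the value and in-code assignment are illustrative, not part of the diff; in practice this would live in the backend's environment or .env):

    import os

    # Hypothetical: ask the crawler to follow links two levels deep.
    os.environ["CRAWL_DEPTH"] = "2"

    depth = int(os.getenv("CRAWL_DEPTH", "1"))
    print(depth)  # 2, instead of the fallback of 1
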
@@ -26,19 +27,43 @@ class CrawlWebsite(BaseModel):
         return None
 
     def process(self):
-        content = self._crawl(self.url)
+        visited_list=[]
+        self._process_level(self.url, 0, visited_list)
+        return visited_list
+
+    def _process_level(self, url, level_depth, visited_list):
+        content = self._crawl(url)
+        if content is None:
+            return
+
 
         # Create a file
-        file_name = slugify(self.url) + ".html"
+        file_name = slugify(url) + ".html"
         temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
         with open(temp_file_path, "w") as temp_file:
             temp_file.write(content)  # pyright: ignore reportPrivateUsage=none
             # Process the file
 
         if content:
-            return temp_file_path, file_name
-        else:
-            return None
+            visited_list.append((temp_file_path, file_name))
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(content, 'html5lib')
+        links = soup.findAll('a')
+        if level_depth < self.depth:
+            for a in links:
+                if not a.has_attr('href'):
+                    continue
+                new_url = a['href']
+                file_name = slugify(new_url) + ".html"
+                already_visited = False
+                for (fpath,fname) in visited_list:
+                    if fname == file_name :
+                        already_visited = True
+                        break
+                if not already_visited:
+                    self._process_level(urljoin(url,new_url),level_depth + 1,visited_list)
+
+
 
     def checkGithub(self):
         if "github.com" in self.url:
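
The calling contract changes with this hunk: process() no longer returns a single (temp_file_path, file_name) tuple (or None); it now returns visited_list, with one (temp_file_path, file_name) entry per page reached within `depth` link hops. A hypothetical caller, sketched from the diff alone (the URL is a placeholder):

    # Assumes CrawlWebsite as defined above.
    crawl_website = CrawlWebsite(url="https://docs.example.com")

    for temp_file_path, file_name in crawl_website.process():
        print(f"crawled {file_name} -> {temp_file_path}")
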
@@ -24,3 +24,5 @@ pyright==1.1.316
 resend==0.5.1
 psycopg2-binary==2.9.6
 sqlalchemy==2.0.19
+html5lib==1.1
+bs4==0.0.1
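
The two new entries back the BeautifulSoup/html5lib link extraction added above; reinstalling the backend requirements (pip install -r requirements.txt) picks them up. Note that bs4 on PyPI is a small shim whose only job is to pull in beautifulsoup4. A quick sanity check that the parser backend is available (illustrative, not part of the change):

    from bs4 import BeautifulSoup

    # html5lib wraps the fragment in a full document, but the link is still found.
    print(BeautifulSoup("<a href='/x'>x</a>", "html5lib").find("a")["href"])  # /x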