feat (926): make crawl recursively navigate linked pages (#927)

Tural Sadık 2023-08-11 08:20:12 +02:00 committed by GitHub
parent b0e7c85316
commit d4d19bbf0b
2 changed files with 33 additions and 6 deletions


@@ -3,6 +3,7 @@ import re
 import tempfile
 import unicodedata
+from urllib.parse import urljoin
 
 import requests
 from pydantic import BaseModel
@@ -10,7 +11,7 @@ from pydantic import BaseModel
 class CrawlWebsite(BaseModel):
     url: str
     js: bool = False
-    depth: int = 1
+    depth: int = int(os.getenv("CRAWL_DEPTH", "1"))
     max_pages: int = 100
     max_time: int = 60
@@ -26,19 +27,43 @@ class CrawlWebsite(BaseModel):
         return None
 
     def process(self):
-        content = self._crawl(self.url)
+        visited_list = []
+        self._process_level(self.url, 0, visited_list)
+        return visited_list
+
+    def _process_level(self, url, level_depth, visited_list):
+        content = self._crawl(url)
+        if content is None:
+            return
 
         # Create a file
-        file_name = slugify(self.url) + ".html"
+        file_name = slugify(url) + ".html"
         temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
         with open(temp_file_path, "w") as temp_file:
             temp_file.write(content)  # pyright: ignore reportPrivateUsage=none
 
         # Process the file
         if content:
-            return temp_file_path, file_name
-        else:
-            return None
+            visited_list.append((temp_file_path, file_name))
+
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(content, 'html5lib')
+        links = soup.findAll('a')
+        if level_depth < self.depth:
+            for a in links:
+                if not a.has_attr('href'):
+                    continue
+                new_url = a['href']
+                file_name = slugify(new_url) + ".html"
+                already_visited = False
+                for (fpath, fname) in visited_list:
+                    if fname == file_name:
+                        already_visited = True
+                        break
+                if not already_visited:
+                    self._process_level(urljoin(url, new_url), level_depth + 1, visited_list)
 
     def checkGithub(self):
         if "github.com" in self.url:


@@ -24,3 +24,5 @@ pyright==1.1.316
 resend==0.5.1
 psycopg2-binary==2.9.6
 sqlalchemy==2.0.19
+html5lib==1.1
+bs4==0.0.1
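The two new requirements back the link extraction added in _process_level: bs4 pulls in BeautifulSoup, and html5lib is the parser it is asked to use. A minimal standalone sketch of that parsing step, with an illustrative HTML snippet and base URL:

    # Illustrative input; the parsing calls mirror the ones added in the diff.
    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    html = '<a href="/docs">Docs</a><a name="no-href">skipped</a>'
    soup = BeautifulSoup(html, 'html5lib')
    for a in soup.findAll('a'):
        if not a.has_attr('href'):  # anchors without an href are skipped
            continue
        print(urljoin("https://example.com/", a['href']))  # -> https://example.com/docs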