feat(crawler): Add Playwright for web crawling (#2562)

This pull request adds the Playwright library for web crawling. It adds the
dependency to the Pipfile, requirements.txt and both Dockerfiles (where
playwright install --with-deps fetches the browser binaries), and replaces the
previous requests/BeautifulSoup/newspaper crawler with LangChain's
PlaywrightURLLoader, so crawled pages are rendered in a headless browser
before their text is extracted.
Stan Girard 2024-05-08 16:20:35 +02:00 committed by GitHub
parent cb0ec25190
commit e33d497598
10 changed files with 738 additions and 399 deletions
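
For context, Playwright drives a real headless browser, so pages that only
render their content through JavaScript can still be crawled. The PR reaches
it through LangChain's PlaywrightURLLoader (see the crawl.py diff below)
rather than calling Playwright directly; the sketch below only illustrates
what the new dependency provides, assuming playwright install has been run
and using a placeholder URL.

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")  # placeholder URL
    html = page.content()  # the fully rendered DOM, not just the raw HTTP response
    browser.close()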

View File

@@ -63,6 +63,7 @@ unidecode = "*"
 flashrank = "*"
 langchain-cohere = "*"
 pyinstrument = "*"
+playwright = "*"
 
 [dev-packages]
 black = "*"

Pipfile.lock generated (946 changed lines)

File diff suppressed because it is too large.

View File

@@ -53,7 +53,8 @@ COPY ./requirements.txt .
 # Upgrade pip and install dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r requirements.txt
+    pip install --no-cache-dir -r requirements.txt && \
+    playwright install --with-deps
 
 # Copy the rest of the application
 COPY . .

View File

@@ -41,7 +41,8 @@ COPY ./requirements.txt .
 RUN pip install --upgrade pip
 
 # Increase timeout to wait for the new installation
-RUN pip install --no-cache-dir -r requirements.txt --timeout 200
+RUN pip install --no-cache-dir -r requirements.txt --timeout 200 && \
+    playwright install --with-deps
 
 WORKDIR /code
 # Copy the rest of the application

View File

@@ -126,6 +126,13 @@ def process_crawl_and_notify(
                 original_file_name=crawl_website_url,
             )
         )
+        notification_service.update_notification_by_id(
+            notification_id,
+            NotificationUpdatableProperties(
+                status=NotificationsStatusEnum.SUCCESS,
+                description=f"Your URL has been properly crawled!",
+            ),
+        )
     else:
         loop = asyncio.get_event_loop()
         message = loop.run_until_complete(

View File

@@ -1,8 +1,10 @@
+import asyncio
 import os
 import tempfile
 from typing import List
 
 import nest_asyncio
+import uvloop
 from fastapi import UploadFile
 from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate
 from langchain_community.chat_models import ChatLiteLLM
@@ -22,7 +24,8 @@ from modules.assistant.dto.outputs import (
 from modules.assistant.ito.ito import ITO
 from modules.user.entity.user_identity import UserIdentity
 
-nest_asyncio.apply()
+if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
+    nest_asyncio.apply()
 
 logger = get_logger(__name__)
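
This guard (duplicated in the file-processing module below) exists because
nest_asyncio can only re-patch the stock asyncio event loop; calling
nest_asyncio.apply() while a uvloop loop is installed (uvloop is pinned in the
requirements.txt diff further down, and uvicorn[standard] will prefer it)
raises an error instead. A minimal standalone sketch of the same guard,
assuming nest_asyncio and uvloop are installed:

import asyncio

import nest_asyncio
import uvloop

# nest_asyncio refuses to patch non-default loops such as uvloop's,
# so only apply the patch when the current loop is one it can handle.
if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
    nest_asyncio.apply()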

View File

@@ -2,13 +2,13 @@ import os
 import re
 import tempfile
 import unicodedata
-from urllib.parse import urljoin
 
-import requests
-from bs4 import BeautifulSoup
-from newspaper import Article
+from langchain_community.document_loaders import PlaywrightURLLoader
 from logger import get_logger
 from pydantic import BaseModel
 
 logger = get_logger(__name__)
 
 
 class CrawlWebsite(BaseModel):
     url: str
@@ -17,53 +17,20 @@ class CrawlWebsite(BaseModel):
     max_pages: int = 100
     max_time: int = 60
 
-    def _crawl(self, url):
-        try:
-            response = requests.get(url)
-            if response.status_code == 200:
-                return response.text
-            else:
-                return None
-        except Exception as e:
-            print(e)
-            raise
-
-    def extract_content(self, url):
-        article = Article(url)
-        try:
-            article.download()
-            article.parse()
-        except Exception as e:
-            print(f"Error downloading or parsing article: {e}")
-            return None
-        return article.text
-
-    def _process_recursive(self, url, depth, visited_urls):
-        if depth == 0 or url in visited_urls:
-            return ""
-        visited_urls.add(url)
-        content = self.extract_content(url)
-        raw_html = self._crawl(url)
-        if not raw_html:
-            return content
-        soup = BeautifulSoup(raw_html, "html.parser")
-        links = [a["href"] for a in soup.find_all("a", href=True)]
-        for link in links:
-            full_url = urljoin(url, link)
-            # Ensure we're staying on the same domain
-            if self.url in full_url:
-                content += self._process_recursive(full_url, depth - 1, visited_urls)  # type: ignore
-        return content
-
     def process(self):
-        # Extract and combine content recursively
-        visited_urls = set()
-        extracted_content = self._process_recursive(self.url, self.depth, visited_urls)
+        loader = PlaywrightURLLoader(
+            urls=[self.url], remove_selectors=["header", "footer"]
+        )
+        data = loader.load()
+        # Now turn the data into a string
+        logger.info(f"Extracted content from {len(data)} pages")
+        logger.info(data)
+        extracted_content = ""
+        for page in data:
+            extracted_content += page.page_content
 
         # Create a file
         file_name = slugify(self.url) + ".txt"
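
The recursive requests/BeautifulSoup/newspaper crawl above is replaced by a
single PlaywrightURLLoader call: the loader renders the URL in a headless
browser, drops elements matching the header and footer selectors, and returns
one Document per URL, whose page_content strings are concatenated and written
to a .txt file. A standalone sketch of that loader call, assuming
langchain-community is installed, the Playwright browsers are present, and
using a placeholder URL:

from langchain_community.document_loaders import PlaywrightURLLoader

loader = PlaywrightURLLoader(
    urls=["https://example.com"],  # process() passes only self.url
    remove_selectors=["header", "footer"],  # strip page chrome before extracting text
)
docs = loader.load()  # one Document per URL
extracted_content = "".join(doc.page_content for doc in docs)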

View File

@@ -1,9 +1,11 @@
+import asyncio
 import os
 import tempfile
 import time
 
 import nest_asyncio
 import tiktoken
+import uvloop
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from llama_parse import LlamaParse
@@ -13,7 +15,8 @@ from modules.brain.service.brain_vector_service import BrainVectorService
 from modules.upload.service.upload_file import DocumentSerializable
 from packages.embeddings.vectors import Neurons
 
-nest_asyncio.apply()
+if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
+    nest_asyncio.apply()
 
 logger = get_logger(__name__)

View File

@@ -17,8 +17,8 @@ backoff==2.2.1; python_version >= '3.7' and python_version < '4.0'
beautifulsoup4==4.12.3; python_full_version >= '3.6.0'
billiard==4.2.0; python_version >= '3.7'
black==24.4.2; python_version >= '3.8'
boto3==1.34.95; python_version >= '3.8'
botocore==1.34.95; python_version >= '3.8'
boto3==1.34.100; python_version >= '3.8'
botocore==1.34.100; python_version >= '3.8'
cachetools==5.3.3; python_version >= '3.7'
celery[redis,sqs]==5.4.0; python_version >= '3.8'
certifi==2024.2.2; python_version >= '3.6'
@@ -29,15 +29,15 @@ click==8.1.7; python_version >= '3.7'
click-didyoumean==0.3.1; python_full_version >= '3.6.2'
click-plugins==1.1.1
click-repl==0.3.0; python_version >= '3.6'
cohere==5.3.4; python_version >= '3.8' and python_version < '4.0'
cohere==5.3.5; python_version >= '3.8' and python_version < '4.0'
coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
colorlog==6.8.2; python_version >= '3.6'
contourpy==1.2.1; python_version >= '3.9'
cryptography==42.0.5; python_version >= '3.7'
cryptography==42.0.7; python_version >= '3.7'
cssselect==1.2.0; python_version >= '3.7'
cycler==0.12.1; python_version >= '3.8'
dataclasses-json==0.6.5; python_version >= '3.7' and python_version < '4.0'
datasets==2.19.0; python_full_version >= '3.8.0'
datasets==2.19.1; python_full_version >= '3.8.0'
debugpy==1.8.1; python_version >= '3.8'
decorator==5.1.1; python_version >= '3.5'
deepdiff==7.0.1; python_version >= '3.8'
@@ -48,15 +48,18 @@ dill==0.3.8; python_version >= '3.8'
dirtyjson==1.0.8
diskcache==5.6.3; python_version >= '3'
distro==1.9.0; python_version >= '3.6'
dnspython==2.6.1; python_version >= '3.8'
docker==7.0.0; python_version >= '3.8'
docx2txt==0.8
duckdb==0.10.2; python_full_version >= '3.7.0'
ecdsa==0.19.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
effdet==0.4.1
email-validator==2.1.1; python_version >= '3.8'
emoji==2.11.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
et-xmlfile==1.1.0; python_version >= '3.6'
faker==19.13.0; python_version >= '3.8'
fastapi==0.110.3; python_version >= '3.8'
fastapi==0.111.0; python_version >= '3.8'
fastapi-cli==0.0.3; python_version >= '3.8'
fastavro==1.9.4; python_version >= '3.8'
feedfinder2==0.0.4
feedparser==6.0.11; python_version >= '3.6'
@@ -78,15 +81,16 @@ google-auth==2.29.0; python_version >= '3.7'
google-cloud-vision==3.7.2
googleapis-common-protos==1.63.0; python_version >= '3.7'
gotrue==2.4.2; python_version >= '3.8' and python_version < '4.0'
greenlet==3.0.3
greenlet==3.0.3; python_version >= '3.7'
grpcio==1.63.0
grpcio-status==1.62.2
h11==0.14.0; python_version >= '3.7'
html5lib==1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
httpcore==1.0.5; python_version >= '3.8'
httptools==0.6.1
httpx==0.27.0; python_version >= '3.8'
httpx-sse==0.4.0; python_version >= '3.8'
huggingface-hub==0.22.2; python_full_version >= '3.8.0'
huggingface-hub==0.23.0; python_full_version >= '3.8.0'
humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
humanize==4.9.0; python_version >= '3.8'
idna==3.7; python_version >= '3.5'
@@ -94,9 +98,9 @@ importlib-metadata==7.1.0; python_version >= '3.8'
iniconfig==2.0.0; python_version >= '3.7'
iopath==0.1.10; python_version >= '3.6'
jieba3k==0.35.1
jinja2==3.1.3; python_version >= '3.7'
jinja2==3.1.4; python_version >= '3.7'
jmespath==1.0.1; python_version >= '3.7'
joblib==1.4.0; python_version >= '3.8'
joblib==1.4.2; python_version >= '3.8'
jq==1.7.0; python_version >= '3.5'
jsonpatch==1.33; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
jsonpath-python==1.0.6; python_version >= '3.6'
@@ -105,38 +109,40 @@ kiwisolver==1.4.5; python_version >= '3.7'
kombu[sqs]==5.3.7; python_version >= '3.8'
langchain==0.1.17; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-cohere==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-community==0.0.36; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-core==0.1.48; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-openai==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-community==0.0.37; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-core==0.1.52; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-openai==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-text-splitters==0.0.1; python_version < '4.0' and python_full_version >= '3.8.1'
langdetect==1.0.9
langfuse==2.28.0; python_version < '4.0' and python_full_version >= '3.8.1'
langsmith==0.1.52; python_version < '4.0' and python_full_version >= '3.8.1'
langfuse==2.29.2; python_version < '4.0' and python_full_version >= '3.8.1'
langsmith==0.1.55; python_version < '4.0' and python_full_version >= '3.8.1'
layoutparser[layoutmodels,tesseract]==0.3.4; python_version >= '3.6'
litellm==1.35.33; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
litellm==1.36.2; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
llama-cpp-python==0.2.67; python_version >= '3.8'
llama-index==0.10.33; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-agent-openai==0.2.3; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index==0.10.35; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-agent-openai==0.2.4; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-cli==0.1.12; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-core==0.10.33; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-core==0.10.35.post1; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-embeddings-openai==0.1.9; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-indices-managed-llama-cloud==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-legacy==0.9.48; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-llms-openai==0.1.16; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-llms-openai==0.1.17; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-multi-modal-llms-openai==0.1.5; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-program-openai==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-question-gen-openai==0.1.3; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-readers-file==0.1.19; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-readers-file==0.1.21; python_version < '4.0' and python_full_version >= '3.8.1'
llama-index-readers-llama-parse==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
llama-parse==0.4.2; python_version < '4.0' and python_full_version >= '3.8.1'
llamaindex-py-client==0.1.19; python_version >= '3.8' and python_version < '4'
lxml[html_clean]==5.2.1; python_version >= '3.6'
lxml-html-clean==0.1.1
markdown==3.6
markdown-it-py==3.0.0; python_version >= '3.8'
markupsafe==2.1.5; python_version >= '3.7'
marshmallow==3.21.1; python_version >= '3.8'
marshmallow==3.21.2; python_version >= '3.8'
matplotlib==3.8.4; python_version >= '3.9'
mccabe==0.7.0; python_version >= '3.6'
mdurl==0.1.2; python_version >= '3.7'
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
@@ -153,14 +159,14 @@ olefile==0.47; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2,
omegaconf==2.3.0; python_version >= '3.6'
onnx==1.16.0
onnxruntime==1.17.3
openai==1.25.0; python_full_version >= '3.7.1'
openai==1.26.0; python_full_version >= '3.7.1'
opencv-python==4.9.0.80; python_version >= '3.6'
openpyxl==3.1.2
ordered-set==4.1.0; python_version >= '3.7'
orjson==3.10.2; python_version >= '3.8'
orjson==3.10.3; python_version >= '3.8'
packaging==23.2; python_version >= '3.7'
pandas==1.5.3; python_version >= '3.8'
pandasai==2.0.36; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8' and python_version >= '3.9'
pandasai==2.0.37; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8' and python_version >= '3.9'
pathspec==0.12.1; python_version >= '3.8'
pdf2image==1.17.0
pdfminer.six==20231228
@@ -169,6 +175,7 @@ pikepdf==8.15.1
pillow==10.3.0; python_version >= '3.8'
pillow-heif==0.16.0
platformdirs==4.2.1; python_version >= '3.8'
playwright==1.43.0; python_version >= '3.8'
pluggy==1.5.0; python_version >= '3.8'
portalocker==2.8.2; python_version >= '3.8'
postgrest==0.16.4; python_version >= '3.8' and python_version < '4.0'
@@ -192,13 +199,15 @@ pycurl==7.45.3
pydantic==2.7.1; python_version >= '3.8'
pydantic-core==2.18.2; python_version >= '3.8'
pydantic-settings==2.2.1; python_version >= '3.8'
pyee==11.1.0; python_version >= '3.8'
pyflakes==3.2.0; python_version >= '3.8'
pygments==2.18.0; python_version >= '3.8'
pyinstrument==4.6.2; python_version >= '3.7'
pypandoc==1.13; python_version >= '3.6'
pyparsing==3.1.2; python_full_version >= '3.6.8'
pypdf==4.2.0; python_version >= '3.6'
pypdfium2==4.29.0; python_version >= '3.6'
pyright==1.1.361; python_version >= '3.7'
pyright==1.1.362; python_version >= '3.7'
pysbd==0.3.4; python_version >= '3'
pytesseract==0.3.10; python_version >= '3.7'
pytest==8.2.0; python_version >= '3.8'
@@ -207,7 +216,7 @@ pytest-docker-tools==3.1.3; python_full_version >= '3.7.0' and python_full_versi
pytest-dotenv==0.5.2
pytest-mock==3.14.0; python_version >= '3.8'
python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-docx==1.1.0
python-docx==1.1.2
python-dotenv==1.0.1; python_version >= '3.8'
python-iso639==2024.4.27; python_version >= '3.8'
python-jose==3.3.0
@@ -217,7 +226,7 @@ python-pptx==0.6.23
pytz==2024.1
pyyaml==6.0.1; python_version >= '3.6'
ragas==0.1.7
rapidfuzz==3.8.1; python_version >= '3.8'
rapidfuzz==3.9.0; python_version >= '3.8'
realtime==1.0.4; python_version >= '3.8' and python_version < '4.0'
redis==5.0.4; python_version >= '3.7'
regex==2024.4.28; python_version >= '3.8'
@@ -225,27 +234,29 @@ requests==2.31.0; python_version >= '3.7'
requests-file==2.0.0
resend==0.8.0; python_version >= '3.7'
retry==0.9.2
rich==13.7.1; python_full_version >= '3.7.0'
rsa==4.9; python_version >= '3.6' and python_version < '4'
s3transfer==0.10.1; python_version >= '3.8'
safetensors==0.4.3; python_version >= '3.7'
scipy==1.13.0; python_version >= '3.9'
sentry-sdk[fastapi]==2.0.1; python_version >= '3.6'
sentry-sdk[fastapi]==2.1.1; python_version >= '3.6'
setuptools==69.5.1; python_version >= '3.8'
sgmllib3k==1.0.0
shellingham==1.5.4; python_version >= '3.7'
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
smmap==5.0.1; python_version >= '3.7'
sniffio==1.3.1; python_version >= '3.7'
soupsieve==2.5; python_version >= '3.8'
sqlalchemy[asyncio]==2.0.29; python_version >= '3.7'
sqlalchemy[asyncio]==2.0.30; python_version >= '3.7'
starlette==0.37.2; python_version >= '3.8'
storage3==0.7.4; python_version >= '3.8' and python_version < '4.0'
strenum==0.4.15
striprtf==0.0.26
supabase==2.4.3; python_version >= '3.8' and python_version < '4.0'
supabase==2.4.5; python_version >= '3.8' and python_version < '4.0'
supafunc==0.4.5; python_version >= '3.8' and python_version < '4.0'
sympy==1.12; python_version >= '3.8'
tabulate==0.9.0; python_version >= '3.7'
tenacity==8.2.3; python_version >= '3.7'
tenacity==8.3.0; python_version >= '3.8'
tiktoken==0.6.0; python_version >= '3.8'
timm==0.9.16; python_version >= '3.8'
tinysegmenter==0.3
@@ -254,21 +265,25 @@ tokenizers==0.19.1; python_version >= '3.7'
torch==2.3.0
torchvision==0.18.0
tornado==6.4; python_version >= '3.8'
tqdm==4.66.2; python_version >= '3.7'
transformers==4.40.1; python_full_version >= '3.8.0'
tqdm==4.66.4; python_version >= '3.7'
transformers==4.40.2; python_full_version >= '3.8.0'
typer==0.12.3; python_version >= '3.7'
types-requests==2.31.0.20240406; python_version >= '3.8'
typing-extensions==4.11.0; python_version >= '3.8'
typing-inspect==0.9.0
tzdata==2024.1; python_version >= '2'
ujson==5.9.0; python_version >= '3.8'
unidecode==1.3.8; python_version >= '3.5'
unstructured[all-docs]==0.13.6; python_version < '3.12' and python_full_version >= '3.9.0'
unstructured-client==0.22.0; python_version >= '3.8'
unstructured-inference==0.7.29
unstructured.pytesseract==0.3.12
urllib3==2.2.1; python_version >= '3.10'
uvicorn==0.29.0; python_version >= '3.8'
uvicorn[standard]==0.29.0; python_version >= '3.8'
uvloop==0.19.0
vine==5.1.0; python_version >= '3.6'
watchdog==4.0.0; python_version >= '3.8'
watchfiles==0.21.0
wcwidth==0.2.13
webencodings==0.5.1
websockets==12.0; python_version >= '3.8'

View File

@@ -5,8 +5,14 @@ from celery_worker import process_crawl_and_notify
 from fastapi import APIRouter, Depends, Query, Request
 from logger import get_logger
 from middlewares.auth import AuthBearer, get_current_user
+from modules.brain.entity.brain_entity import RoleEnum
+from modules.brain.service.brain_authorization_service import (
+    validate_brain_authorization,
+)
 from modules.knowledge.dto.inputs import CreateKnowledgeProperties
 from modules.knowledge.service.knowledge_service import KnowledgeService
+from modules.notification.dto.inputs import CreateNotification
+from modules.notification.entity.notification import NotificationsStatusEnum
 from modules.notification.service.notification_service import NotificationService
 from modules.user.entity.user_identity import UserIdentity
 from modules.user.service.user_usage import UserUsage
@@ -37,7 +43,9 @@ async def crawl_endpoint(
     Crawl a website and process the crawled data.
     """
     # [TODO] check if the user is the owner/editor of the brain
+    validate_brain_authorization(
+        brain_id, current_user.id, [RoleEnum.Editor, RoleEnum.Owner]
+    )
 
     userDailyUsage = UserUsage(
         id=current_user.id,
@@ -54,6 +62,13 @@
             "type": "error",
         }
     else:
+        upload_notification = notification_service.add_notification(
+            CreateNotification(
+                user_id=current_user.id,
+                status=NotificationsStatusEnum.INFO,
+                title=f"Processing Crawl {crawl_website.url}",
+            )
+        )
         knowledge_to_add = CreateKnowledgeProperties(
             brain_id=brain_id,
             url=crawl_website.url,
@@ -66,7 +81,7 @@
         process_crawl_and_notify.delay(
             crawl_website_url=crawl_website.url,
             brain_id=brain_id,
-            notification_id=None,
+            notification_id=upload_notification.id,
         )
 
         return {"message": "Crawl processing has started."}