mirror of
https://github.com/StanGirard/quivr.git
synced 2024-11-26 03:15:19 +03:00
feat(crawler): Add Playwright for web crawling (#2562)
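This pull request adds the Playwright library for web crawling. It adds Playwright to the dependencies (Pipfile and requirements.txt), installs its browsers in the Docker images with `playwright install --with-deps`, and updates the crawler to load pages with LangChain's PlaywrightURLLoader.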
This commit is contained in:
parent
cb0ec25190
commit
e33d497598
1
Pipfile
1
Pipfile
@ -63,6 +63,7 @@ unidecode = "*"
|
||||
flashrank = "*"
|
||||
langchain-cohere = "*"
|
||||
pyinstrument = "*"
|
||||
playwright = "*"
|
||||
|
||||
[dev-packages]
|
||||
black = "*"
|
||||
|
946
Pipfile.lock
generated
946
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@ -53,7 +53,8 @@ COPY ./requirements.txt .
|
||||
|
||||
# Upgrade pip and install dependencies
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r requirements.txt
|
||||
pip install --no-cache-dir -r requirements.txt && \
|
||||
playwright install --with-deps
|
||||
|
||||
# Copy the rest of the application
|
||||
COPY . .
|
||||
|
@ -41,7 +41,8 @@ COPY ./requirements.txt .
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
# Increase pip's network timeout so slow dependency downloads do not abort the install
|
||||
RUN pip install --no-cache-dir -r requirements.txt --timeout 200
|
||||
RUN pip install --no-cache-dir -r requirements.txt --timeout 200 && \
|
||||
playwright install --with-deps
|
||||
|
||||
WORKDIR /code
|
||||
# Copy the rest of the application
|
||||
|
@ -126,6 +126,13 @@ def process_crawl_and_notify(
|
||||
original_file_name=crawl_website_url,
|
||||
)
|
||||
)
|
||||
notification_service.update_notification_by_id(
|
||||
notification_id,
|
||||
NotificationUpdatableProperties(
|
||||
status=NotificationsStatusEnum.SUCCESS,
|
||||
description=f"Your URL has been properly crawled!",
|
||||
),
|
||||
)
|
||||
else:
|
||||
loop = asyncio.get_event_loop()
|
||||
message = loop.run_until_complete(
|
||||
|
@ -1,8 +1,10 @@
|
||||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
import nest_asyncio
|
||||
import uvloop
|
||||
from fastapi import UploadFile
|
||||
from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
@ -22,7 +24,8 @@ from modules.assistant.dto.outputs import (
|
||||
from modules.assistant.ito.ito import ITO
|
||||
from modules.user.entity.user_identity import UserIdentity
|
||||
|
||||
nest_asyncio.apply()
|
||||
if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
|
||||
nest_asyncio.apply()
|
||||
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
@ -2,13 +2,13 @@ import os
|
||||
import re
|
||||
import tempfile
|
||||
import unicodedata
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from newspaper import Article
|
||||
from langchain_community.document_loaders import PlaywrightURLLoader
|
||||
from logger import get_logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class CrawlWebsite(BaseModel):
|
||||
url: str
|
||||
@ -17,53 +17,20 @@ class CrawlWebsite(BaseModel):
|
||||
max_pages: int = 100
|
||||
max_time: int = 60
|
||||
|
||||
def _crawl(self, url):
|
||||
try:
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
else:
|
||||
return None
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise
|
||||
|
||||
def extract_content(self, url):
|
||||
article = Article(url)
|
||||
try:
|
||||
article.download()
|
||||
article.parse()
|
||||
except Exception as e:
|
||||
print(f"Error downloading or parsing article: {e}")
|
||||
return None
|
||||
return article.text
|
||||
|
||||
def _process_recursive(self, url, depth, visited_urls):
|
||||
if depth == 0 or url in visited_urls:
|
||||
return ""
|
||||
|
||||
visited_urls.add(url)
|
||||
|
||||
content = self.extract_content(url)
|
||||
raw_html = self._crawl(url)
|
||||
|
||||
if not raw_html:
|
||||
return content
|
||||
|
||||
soup = BeautifulSoup(raw_html, "html.parser")
|
||||
links = [a["href"] for a in soup.find_all("a", href=True)]
|
||||
for link in links:
|
||||
full_url = urljoin(url, link)
|
||||
# Ensure we're staying on the same domain
|
||||
if self.url in full_url:
|
||||
content += self._process_recursive(full_url, depth - 1, visited_urls) # type: ignore
|
||||
|
||||
return content
|
||||
|
||||
def process(self):
|
||||
# Extract and combine content recursively
|
||||
visited_urls = set()
|
||||
extracted_content = self._process_recursive(self.url, self.depth, visited_urls)
|
||||
loader = PlaywrightURLLoader(
|
||||
urls=[self.url], remove_selectors=["header", "footer"]
|
||||
)
|
||||
data = loader.load()
|
||||
|
||||
# Now turn the data into a string
|
||||
logger.info(f"Extracted content from {len(data)} pages")
|
||||
logger.info(data)
|
||||
extracted_content = ""
|
||||
for page in data:
|
||||
extracted_content += page.page_content
|
||||
|
||||
# Create a file
|
||||
file_name = slugify(self.url) + ".txt"
|
||||
|
@ -1,9 +1,11 @@
|
||||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import nest_asyncio
|
||||
import tiktoken
|
||||
import uvloop
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from llama_parse import LlamaParse
|
||||
@ -13,7 +15,8 @@ from modules.brain.service.brain_vector_service import BrainVectorService
|
||||
from modules.upload.service.upload_file import DocumentSerializable
|
||||
from packages.embeddings.vectors import Neurons
|
||||
|
||||
nest_asyncio.apply()
|
||||
if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
|
||||
nest_asyncio.apply()
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
@ -17,8 +17,8 @@ backoff==2.2.1; python_version >= '3.7' and python_version < '4.0'
|
||||
beautifulsoup4==4.12.3; python_full_version >= '3.6.0'
|
||||
billiard==4.2.0; python_version >= '3.7'
|
||||
black==24.4.2; python_version >= '3.8'
|
||||
boto3==1.34.95; python_version >= '3.8'
|
||||
botocore==1.34.95; python_version >= '3.8'
|
||||
boto3==1.34.100; python_version >= '3.8'
|
||||
botocore==1.34.100; python_version >= '3.8'
|
||||
cachetools==5.3.3; python_version >= '3.7'
|
||||
celery[redis,sqs]==5.4.0; python_version >= '3.8'
|
||||
certifi==2024.2.2; python_version >= '3.6'
|
||||
@ -29,15 +29,15 @@ click==8.1.7; python_version >= '3.7'
|
||||
click-didyoumean==0.3.1; python_full_version >= '3.6.2'
|
||||
click-plugins==1.1.1
|
||||
click-repl==0.3.0; python_version >= '3.6'
|
||||
cohere==5.3.4; python_version >= '3.8' and python_version < '4.0'
|
||||
cohere==5.3.5; python_version >= '3.8' and python_version < '4.0'
|
||||
coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||
colorlog==6.8.2; python_version >= '3.6'
|
||||
contourpy==1.2.1; python_version >= '3.9'
|
||||
cryptography==42.0.5; python_version >= '3.7'
|
||||
cryptography==42.0.7; python_version >= '3.7'
|
||||
cssselect==1.2.0; python_version >= '3.7'
|
||||
cycler==0.12.1; python_version >= '3.8'
|
||||
dataclasses-json==0.6.5; python_version >= '3.7' and python_version < '4.0'
|
||||
datasets==2.19.0; python_full_version >= '3.8.0'
|
||||
datasets==2.19.1; python_full_version >= '3.8.0'
|
||||
debugpy==1.8.1; python_version >= '3.8'
|
||||
decorator==5.1.1; python_version >= '3.5'
|
||||
deepdiff==7.0.1; python_version >= '3.8'
|
||||
@ -48,15 +48,18 @@ dill==0.3.8; python_version >= '3.8'
|
||||
dirtyjson==1.0.8
|
||||
diskcache==5.6.3; python_version >= '3'
|
||||
distro==1.9.0; python_version >= '3.6'
|
||||
dnspython==2.6.1; python_version >= '3.8'
|
||||
docker==7.0.0; python_version >= '3.8'
|
||||
docx2txt==0.8
|
||||
duckdb==0.10.2; python_full_version >= '3.7.0'
|
||||
ecdsa==0.19.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||
effdet==0.4.1
|
||||
email-validator==2.1.1; python_version >= '3.8'
|
||||
emoji==2.11.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||
et-xmlfile==1.1.0; python_version >= '3.6'
|
||||
faker==19.13.0; python_version >= '3.8'
|
||||
fastapi==0.110.3; python_version >= '3.8'
|
||||
fastapi==0.111.0; python_version >= '3.8'
|
||||
fastapi-cli==0.0.3; python_version >= '3.8'
|
||||
fastavro==1.9.4; python_version >= '3.8'
|
||||
feedfinder2==0.0.4
|
||||
feedparser==6.0.11; python_version >= '3.6'
|
||||
@ -78,15 +81,16 @@ google-auth==2.29.0; python_version >= '3.7'
|
||||
google-cloud-vision==3.7.2
|
||||
googleapis-common-protos==1.63.0; python_version >= '3.7'
|
||||
gotrue==2.4.2; python_version >= '3.8' and python_version < '4.0'
|
||||
greenlet==3.0.3
|
||||
greenlet==3.0.3; python_version >= '3.7'
|
||||
grpcio==1.63.0
|
||||
grpcio-status==1.62.2
|
||||
h11==0.14.0; python_version >= '3.7'
|
||||
html5lib==1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||
httpcore==1.0.5; python_version >= '3.8'
|
||||
httptools==0.6.1
|
||||
httpx==0.27.0; python_version >= '3.8'
|
||||
httpx-sse==0.4.0; python_version >= '3.8'
|
||||
huggingface-hub==0.22.2; python_full_version >= '3.8.0'
|
||||
huggingface-hub==0.23.0; python_full_version >= '3.8.0'
|
||||
humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||
humanize==4.9.0; python_version >= '3.8'
|
||||
idna==3.7; python_version >= '3.5'
|
||||
@ -94,9 +98,9 @@ importlib-metadata==7.1.0; python_version >= '3.8'
|
||||
iniconfig==2.0.0; python_version >= '3.7'
|
||||
iopath==0.1.10; python_version >= '3.6'
|
||||
jieba3k==0.35.1
|
||||
jinja2==3.1.3; python_version >= '3.7'
|
||||
jinja2==3.1.4; python_version >= '3.7'
|
||||
jmespath==1.0.1; python_version >= '3.7'
|
||||
joblib==1.4.0; python_version >= '3.8'
|
||||
joblib==1.4.2; python_version >= '3.8'
|
||||
jq==1.7.0; python_version >= '3.5'
|
||||
jsonpatch==1.33; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
|
||||
jsonpath-python==1.0.6; python_version >= '3.6'
|
||||
@ -105,38 +109,40 @@ kiwisolver==1.4.5; python_version >= '3.7'
|
||||
kombu[sqs]==5.3.7; python_version >= '3.8'
|
||||
langchain==0.1.17; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-cohere==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-community==0.0.36; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-core==0.1.48; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-openai==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-community==0.0.37; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-core==0.1.52; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-openai==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langchain-text-splitters==0.0.1; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langdetect==1.0.9
|
||||
langfuse==2.28.0; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langsmith==0.1.52; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langfuse==2.29.2; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
langsmith==0.1.55; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
layoutparser[layoutmodels,tesseract]==0.3.4; python_version >= '3.6'
|
||||
litellm==1.35.33; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
|
||||
litellm==1.36.2; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
|
||||
llama-cpp-python==0.2.67; python_version >= '3.8'
|
||||
llama-index==0.10.33; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-agent-openai==0.2.3; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index==0.10.35; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-agent-openai==0.2.4; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-cli==0.1.12; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-core==0.10.33; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-core==0.10.35.post1; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-embeddings-openai==0.1.9; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-indices-managed-llama-cloud==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-legacy==0.9.48; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-llms-openai==0.1.16; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-llms-openai==0.1.17; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-multi-modal-llms-openai==0.1.5; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-program-openai==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-question-gen-openai==0.1.3; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-readers-file==0.1.19; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-readers-file==0.1.21; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-index-readers-llama-parse==0.1.4; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llama-parse==0.4.2; python_version < '4.0' and python_full_version >= '3.8.1'
|
||||
llamaindex-py-client==0.1.19; python_version >= '3.8' and python_version < '4'
|
||||
lxml[html_clean]==5.2.1; python_version >= '3.6'
|
||||
lxml-html-clean==0.1.1
|
||||
markdown==3.6
|
||||
markdown-it-py==3.0.0; python_version >= '3.8'
|
||||
markupsafe==2.1.5; python_version >= '3.7'
|
||||
marshmallow==3.21.1; python_version >= '3.8'
|
||||
marshmallow==3.21.2; python_version >= '3.8'
|
||||
matplotlib==3.8.4; python_version >= '3.9'
|
||||
mccabe==0.7.0; python_version >= '3.6'
|
||||
mdurl==0.1.2; python_version >= '3.7'
|
||||
monotonic==1.6
|
||||
mpmath==1.3.0
|
||||
msg-parser==1.2.0
|
||||
@ -153,14 +159,14 @@ olefile==0.47; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2,
|
||||
omegaconf==2.3.0; python_version >= '3.6'
|
||||
onnx==1.16.0
|
||||
onnxruntime==1.17.3
|
||||
openai==1.25.0; python_full_version >= '3.7.1'
|
||||
openai==1.26.0; python_full_version >= '3.7.1'
|
||||
opencv-python==4.9.0.80; python_version >= '3.6'
|
||||
openpyxl==3.1.2
|
||||
ordered-set==4.1.0; python_version >= '3.7'
|
||||
orjson==3.10.2; python_version >= '3.8'
|
||||
orjson==3.10.3; python_version >= '3.8'
|
||||
packaging==23.2; python_version >= '3.7'
|
||||
pandas==1.5.3; python_version >= '3.8'
|
||||
pandasai==2.0.36; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8' and python_version >= '3.9'
|
||||
pandasai==2.0.37; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8' and python_version >= '3.9'
|
||||
pathspec==0.12.1; python_version >= '3.8'
|
||||
pdf2image==1.17.0
|
||||
pdfminer.six==20231228
|
||||
@ -169,6 +175,7 @@ pikepdf==8.15.1
|
||||
pillow==10.3.0; python_version >= '3.8'
|
||||
pillow-heif==0.16.0
|
||||
platformdirs==4.2.1; python_version >= '3.8'
|
||||
playwright==1.43.0; python_version >= '3.8'
|
||||
pluggy==1.5.0; python_version >= '3.8'
|
||||
portalocker==2.8.2; python_version >= '3.8'
|
||||
postgrest==0.16.4; python_version >= '3.8' and python_version < '4.0'
|
||||
@ -192,13 +199,15 @@ pycurl==7.45.3
|
||||
pydantic==2.7.1; python_version >= '3.8'
|
||||
pydantic-core==2.18.2; python_version >= '3.8'
|
||||
pydantic-settings==2.2.1; python_version >= '3.8'
|
||||
pyee==11.1.0; python_version >= '3.8'
|
||||
pyflakes==3.2.0; python_version >= '3.8'
|
||||
pygments==2.18.0; python_version >= '3.8'
|
||||
pyinstrument==4.6.2; python_version >= '3.7'
|
||||
pypandoc==1.13; python_version >= '3.6'
|
||||
pyparsing==3.1.2; python_full_version >= '3.6.8'
|
||||
pypdf==4.2.0; python_version >= '3.6'
|
||||
pypdfium2==4.29.0; python_version >= '3.6'
|
||||
pyright==1.1.361; python_version >= '3.7'
|
||||
pyright==1.1.362; python_version >= '3.7'
|
||||
pysbd==0.3.4; python_version >= '3'
|
||||
pytesseract==0.3.10; python_version >= '3.7'
|
||||
pytest==8.2.0; python_version >= '3.8'
|
||||
@ -207,7 +216,7 @@ pytest-docker-tools==3.1.3; python_full_version >= '3.7.0' and python_full_versi
|
||||
pytest-dotenv==0.5.2
|
||||
pytest-mock==3.14.0; python_version >= '3.8'
|
||||
python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||
python-docx==1.1.0
|
||||
python-docx==1.1.2
|
||||
python-dotenv==1.0.1; python_version >= '3.8'
|
||||
python-iso639==2024.4.27; python_version >= '3.8'
|
||||
python-jose==3.3.0
|
||||
@ -217,7 +226,7 @@ python-pptx==0.6.23
|
||||
pytz==2024.1
|
||||
pyyaml==6.0.1; python_version >= '3.6'
|
||||
ragas==0.1.7
|
||||
rapidfuzz==3.8.1; python_version >= '3.8'
|
||||
rapidfuzz==3.9.0; python_version >= '3.8'
|
||||
realtime==1.0.4; python_version >= '3.8' and python_version < '4.0'
|
||||
redis==5.0.4; python_version >= '3.7'
|
||||
regex==2024.4.28; python_version >= '3.8'
|
||||
@ -225,27 +234,29 @@ requests==2.31.0; python_version >= '3.7'
|
||||
requests-file==2.0.0
|
||||
resend==0.8.0; python_version >= '3.7'
|
||||
retry==0.9.2
|
||||
rich==13.7.1; python_full_version >= '3.7.0'
|
||||
rsa==4.9; python_version >= '3.6' and python_version < '4'
|
||||
s3transfer==0.10.1; python_version >= '3.8'
|
||||
safetensors==0.4.3; python_version >= '3.7'
|
||||
scipy==1.13.0; python_version >= '3.9'
|
||||
sentry-sdk[fastapi]==2.0.1; python_version >= '3.6'
|
||||
sentry-sdk[fastapi]==2.1.1; python_version >= '3.6'
|
||||
setuptools==69.5.1; python_version >= '3.8'
|
||||
sgmllib3k==1.0.0
|
||||
shellingham==1.5.4; python_version >= '3.7'
|
||||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||
smmap==5.0.1; python_version >= '3.7'
|
||||
sniffio==1.3.1; python_version >= '3.7'
|
||||
soupsieve==2.5; python_version >= '3.8'
|
||||
sqlalchemy[asyncio]==2.0.29; python_version >= '3.7'
|
||||
sqlalchemy[asyncio]==2.0.30; python_version >= '3.7'
|
||||
starlette==0.37.2; python_version >= '3.8'
|
||||
storage3==0.7.4; python_version >= '3.8' and python_version < '4.0'
|
||||
strenum==0.4.15
|
||||
striprtf==0.0.26
|
||||
supabase==2.4.3; python_version >= '3.8' and python_version < '4.0'
|
||||
supabase==2.4.5; python_version >= '3.8' and python_version < '4.0'
|
||||
supafunc==0.4.5; python_version >= '3.8' and python_version < '4.0'
|
||||
sympy==1.12; python_version >= '3.8'
|
||||
tabulate==0.9.0; python_version >= '3.7'
|
||||
tenacity==8.2.3; python_version >= '3.7'
|
||||
tenacity==8.3.0; python_version >= '3.8'
|
||||
tiktoken==0.6.0; python_version >= '3.8'
|
||||
timm==0.9.16; python_version >= '3.8'
|
||||
tinysegmenter==0.3
|
||||
@ -254,21 +265,25 @@ tokenizers==0.19.1; python_version >= '3.7'
|
||||
torch==2.3.0
|
||||
torchvision==0.18.0
|
||||
tornado==6.4; python_version >= '3.8'
|
||||
tqdm==4.66.2; python_version >= '3.7'
|
||||
transformers==4.40.1; python_full_version >= '3.8.0'
|
||||
tqdm==4.66.4; python_version >= '3.7'
|
||||
transformers==4.40.2; python_full_version >= '3.8.0'
|
||||
typer==0.12.3; python_version >= '3.7'
|
||||
types-requests==2.31.0.20240406; python_version >= '3.8'
|
||||
typing-extensions==4.11.0; python_version >= '3.8'
|
||||
typing-inspect==0.9.0
|
||||
tzdata==2024.1; python_version >= '2'
|
||||
ujson==5.9.0; python_version >= '3.8'
|
||||
unidecode==1.3.8; python_version >= '3.5'
|
||||
unstructured[all-docs]==0.13.6; python_version < '3.12' and python_full_version >= '3.9.0'
|
||||
unstructured-client==0.22.0; python_version >= '3.8'
|
||||
unstructured-inference==0.7.29
|
||||
unstructured.pytesseract==0.3.12
|
||||
urllib3==2.2.1; python_version >= '3.10'
|
||||
uvicorn==0.29.0; python_version >= '3.8'
|
||||
uvicorn[standard]==0.29.0; python_version >= '3.8'
|
||||
uvloop==0.19.0
|
||||
vine==5.1.0; python_version >= '3.6'
|
||||
watchdog==4.0.0; python_version >= '3.8'
|
||||
watchfiles==0.21.0
|
||||
wcwidth==0.2.13
|
||||
webencodings==0.5.1
|
||||
websockets==12.0; python_version >= '3.8'
|
||||
|
@ -5,8 +5,14 @@ from celery_worker import process_crawl_and_notify
|
||||
from fastapi import APIRouter, Depends, Query, Request
|
||||
from logger import get_logger
|
||||
from middlewares.auth import AuthBearer, get_current_user
|
||||
from modules.brain.entity.brain_entity import RoleEnum
|
||||
from modules.brain.service.brain_authorization_service import (
|
||||
validate_brain_authorization,
|
||||
)
|
||||
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
|
||||
from modules.knowledge.service.knowledge_service import KnowledgeService
|
||||
from modules.notification.dto.inputs import CreateNotification
|
||||
from modules.notification.entity.notification import NotificationsStatusEnum
|
||||
from modules.notification.service.notification_service import NotificationService
|
||||
from modules.user.entity.user_identity import UserIdentity
|
||||
from modules.user.service.user_usage import UserUsage
|
||||
@ -37,7 +43,9 @@ async def crawl_endpoint(
|
||||
Crawl a website and process the crawled data.
|
||||
"""
|
||||
|
||||
# [TODO] check if the user is the owner/editor of the brain
|
||||
validate_brain_authorization(
|
||||
brain_id, current_user.id, [RoleEnum.Editor, RoleEnum.Owner]
|
||||
)
|
||||
|
||||
userDailyUsage = UserUsage(
|
||||
id=current_user.id,
|
||||
@ -54,6 +62,13 @@ async def crawl_endpoint(
|
||||
"type": "error",
|
||||
}
|
||||
else:
|
||||
upload_notification = notification_service.add_notification(
|
||||
CreateNotification(
|
||||
user_id=current_user.id,
|
||||
status=NotificationsStatusEnum.INFO,
|
||||
title=f"Processing Crawl {crawl_website.url}",
|
||||
)
|
||||
)
|
||||
knowledge_to_add = CreateKnowledgeProperties(
|
||||
brain_id=brain_id,
|
||||
url=crawl_website.url,
|
||||
@ -66,7 +81,7 @@ async def crawl_endpoint(
|
||||
process_crawl_and_notify.delay(
|
||||
crawl_website_url=crawl_website.url,
|
||||
brain_id=brain_id,
|
||||
notification_id=None,
|
||||
notification_id=upload_notification.id,
|
||||
)
|
||||
|
||||
return {"message": "Crawl processing has started."}
|
||||
|
Loading…
Reference in New Issue
Block a user