# Description

# Testing backend

## Docker setup

1. Copy `.env.example` to `.env`. Some env variables were added: `EMBEDDING_DIM`.
2. Apply supabase migrations:
   ```sh
   supabase stop
   supabase db reset
   supabase start
   ```
3. Start the backend containers:
   ```sh
   make dev
   ```

## Local setup

You can also run the backend without Docker.

1. Install [`rye`](https://rye.astral.sh/guide/installation/). Choose the managed Python version and set the version to 3.11.
2. Run the following:
   ```sh
   cd quivr/backend
   rye sync
   ```
3. Source the `.venv` virtual env: `source .venv/bin/activate`.
4. Run the backend; make sure redis and the supabase API are running:
   ```sh
   LOG_LEVEL=debug uvicorn quivr_api.main:app --log-level debug --reload --host 0.0.0.0 --port 5050 --workers 1
   ```

   Worker:
   ```sh
   LOG_LEVEL=debug celery -A quivr_worker.celery_worker worker -l info -E --concurrency 1
   ```

   Notifier:
   ```sh
   LOG_LEVEL=debug python worker/quivr_worker/celery_monitor.py
   ```

---------

Co-authored-by: chloedia <chloedaems0@gmail.com>
Co-authored-by: aminediro <aminedirhoussi1@gmail.com>
Co-authored-by: Antoine Dewez <44063631+Zewed@users.noreply.github.com>
Co-authored-by: Chloé Daems <73901882+chloedia@users.noreply.github.com>
Co-authored-by: Zewed <dewez.antoine2@gmail.com>
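As a point of reference, reading the new variable could look like this minimal sketch; the `os.getenv` call and the fallback value are assumptions for illustration, not the project's actual code:

```python
import os

# Illustrative only: this PR adds EMBEDDING_DIM to .env.example.
# The getenv call and the 1536 fallback are assumptions, not the
# backend's actual configuration code.
EMBEDDING_DIM = int(os.getenv("EMBEDDING_DIM", "1536"))
```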
import os
import re
import unicodedata

from langchain_community.document_loaders import PlaywrightURLLoader
from pydantic import BaseModel

from quivr_api.logger import get_logger

logger = get_logger("celery_worker")

class URL(BaseModel):
    url: str
    js: bool = False
    depth: int = int(os.getenv("CRAWL_DEPTH", "1"))
    max_pages: int = 100
    max_time: int = 60

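# Note: URL is a pydantic BaseModel, so URL(url="https://example.com") validates
# field types and applies the defaults above; depth's default is read from the
# CRAWL_DEPTH environment variable once, when the class is defined.
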
async def extract_from_url(url: URL) -> str:
    # Extract and combine content recursively
    loader = PlaywrightURLLoader(urls=[url.url], remove_selectors=["header", "footer"])

    data = await loader.aload()
    # Now turn the data into a string
    logger.info(f"Extracted content from {len(data)} pages")
    extracted_content = ""
    for page in data:
        extracted_content += page.page_content
    return extracted_content

def slugify(text):
    # Strip accents, drop anything that is not a word character, whitespace,
    # or hyphen, lowercase, then collapse whitespace/hyphen runs into single hyphens.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[-\s]+", "-", text)
    return text
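For quick manual testing, a small driver like the following should exercise both helpers; the import path and the example URL are assumptions for illustration, not taken from the repository:

```python
import asyncio

# Assumed import path for illustration; adjust to wherever this module lives.
from quivr_worker.crawl import URL, extract_from_url, slugify


async def main():
    page = URL(url="https://example.com")
    content = await extract_from_url(page)
    # Turn the source URL into a filesystem-safe name for the extracted text.
    print(f"{slugify(page.url)}.txt", len(content))


asyncio.run(main())
```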