feat(upload): async improved (#2544)

# Description

Hey,

Here's a breakdown of what I've done:

- Reducing the number of open file descriptors and the memory footprint: previously, for each uploaded file we opened a temporary NamedTemporaryFile to write the content read from Supabase. Because of the dependency on `langchain` loader classes, we couldn't use in-memory buffers for the loaders. Now we only open a single temporary file per `process_file_and_notify`, cutting down on the excessive file opening, read syscalls, and memory buffer usage that could cause stability issues when ingesting and processing large volumes of documents. Some code paths still reopen temporary files, but this can be improved in later work.
- Removing the `UploadFile` class from `File`: `UploadFile` (a FastAPI abstraction over a SpooledTemporaryFile for multipart uploads) was redundant in our `File` setup, since we already download the file from remote storage, read it into memory, and write it into a temp file. Removing this abstraction streamlines the code and eliminates unnecessary complexity.
- `async` function adjustments: I've removed the `async` labeling from functions that aren't truly asynchronous (a minimal sketch follows this description). For instance, calling `filter_file` to process files isn't genuinely async, as async file reading isn't actually asynchronous; it [uses a threadpool for reading the file](https://github.com/encode/starlette/blob/9f16bf5c25e126200701f6e04330864f4a91a898/starlette/datastructures.py#L458). Given that we already leverage `celery` for parallelism (one worker per core), we need reading and processing to occur in the same thread, or at least to minimize thread spawning. Additionally, since the rest of the code isn't inherently asynchronous, our bottleneck lies in CPU operations rather than asynchronous processing.

These changes aim to improve performance and streamline our codebase. Let me know if you have any questions or suggestions for further improvements!
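
To make the last bullet concrete, here is a minimal sketch (not code from this PR; the function names are illustrative) of why "async" file reading buys nothing inside a celery worker: awaiting Starlette's `UploadFile.read()` only pushes the blocking read onto a worker thread, while a plain synchronous read keeps the work on the current thread.

```python
from starlette.concurrency import run_in_threadpool


async def read_upload_pretending_to_be_async(spooled_file, size: int = -1) -> bytes:
    # Roughly what awaiting UploadFile.read() does: the blocking read still
    # happens, just on a threadpool thread.
    return await run_in_threadpool(spooled_file.read, size)


def read_tmp_file(path: str) -> bytes:
    # Inside a celery worker (one process per core), a plain blocking read keeps
    # the work on the current thread and avoids spawning extra threads.
    with open(path, "rb") as f:
        return f.read()
```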
## Checklist before requesting a review

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have ideally added tests that prove my fix is effective or that my feature works

---------

Signed-off-by: aminediro <aminediro@github.com>
Co-authored-by: aminediro <aminediro@github.com>
Co-authored-by: Stan Girard <girard.stanislas@gmail.com>


import os
from datetime import datetime, timedelta
from tempfile import NamedTemporaryFile
from uuid import UUID

from celery.schedules import crontab
from pytz import timezone

from quivr_api.celery_config import celery
from quivr_api.logger import get_logger
from quivr_api.middlewares.auth.auth_bearer import AuthBearer
from quivr_api.models.files import File
from quivr_api.models.settings import get_supabase_client, get_supabase_db
from quivr_api.modules.brain.integrations.Notion.Notion_connector import NotionConnector
from quivr_api.modules.brain.service.brain_service import BrainService
from quivr_api.modules.brain.service.brain_vector_service import BrainVectorService
from quivr_api.modules.notification.service.notification_service import (
    NotificationService,
)
from quivr_api.packages.files.crawl.crawler import CrawlWebsite, slugify
from quivr_api.packages.files.processors import filter_file
from quivr_api.packages.utils.telemetry import maybe_send_telemetry

logger = get_logger(__name__)

notification_service = NotificationService()
brain_service = BrainService()
auth_bearer = AuthBearer()


@celery.task(
    retries=3,
    default_retry_delay=1,
    name="process_file_and_notify",
    autoretry_for=(Exception,),
)
def process_file_and_notify(
    file_name: str,
    file_original_name: str,
    brain_id,
    notification_id: UUID,
    knowledge_id: UUID,
    integration=None,
    integration_link=None,
    delete_file=False,
):
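    # Overview: download the stored object from Supabase into a single temporary
    # file, wrap it in a File (keeping both the raw bytes and the temp file path
    # for downstream loaders), optionally clear existing vectors for the file,
    # then run filter_file and bump the brain's last-update time.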
    logger.debug(
        f"process_file file_name={file_name}, knowledge_id={knowledge_id}, brain_id={brain_id}, notification_id={notification_id}"
    )
    supabase_client = get_supabase_client()
    tmp_name = file_name.replace("/", "_")
    base_file_name = os.path.basename(file_name)
    _, file_extension = os.path.splitext(base_file_name)

    with NamedTemporaryFile(
        suffix="_" + tmp_name,  # pyright: ignore reportPrivateUsage=none
    ) as tmp_file:
        res = supabase_client.storage.from_("quivr").download(file_name)
        tmp_file.write(res)
        tmp_file.flush()
        file_instance = File(
            file_name=base_file_name,
            tmp_file_path=tmp_file.name,
            bytes_content=res,
            file_size=len(res),
            file_extension=file_extension,
        )
        brain_vector_service = BrainVectorService(brain_id)
        if delete_file:  # TODO fix bug
            brain_vector_service.delete_file_from_brain(
                file_original_name, only_vectors=True
            )

        filter_file(
            file=file_instance,
            brain_id=brain_id,
            original_file_name=file_original_name,
            integration=integration,
            integration_link=integration_link,
        )

        brain_service.update_brain_last_update_time(brain_id)


@celery.task(
    retries=3,
    default_retry_delay=1,
    name="process_crawl_and_notify",
    autoretry_for=(Exception,),
)
def process_crawl_and_notify(
    crawl_website_url: str,
    brain_id: UUID,
    knowledge_id: UUID,
    notification_id=None,
):
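    # Overview: crawl the given URL, write the extracted text to a single
    # temporary file, wrap it in a File, and send it through the same
    # filter_file pipeline as uploaded documents.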
    crawl_website = CrawlWebsite(url=crawl_website_url)

    # Build file data
    extracted_content = crawl_website.process()
    extracted_content_bytes = extracted_content.encode("utf-8")
    file_name = slugify(crawl_website.url) + ".txt"

    with NamedTemporaryFile(
        suffix="_" + file_name,  # pyright: ignore reportPrivateUsage=none
    ) as tmp_file:
        tmp_file.write(extracted_content_bytes)
        tmp_file.flush()
        file_instance = File(
            file_name=file_name,
            tmp_file_path=tmp_file.name,
            bytes_content=extracted_content_bytes,
            file_size=len(extracted_content),
            file_extension=".txt",
        )
        filter_file(
            file=file_instance,
            brain_id=brain_id,
            original_file_name=crawl_website_url,
        )


@celery.task(name="NotionConnectorLoad")
def process_integration_brain_created_initial_load(brain_id, user_id):
    notion_connector = NotionConnector(brain_id=brain_id, user_id=user_id)

    pages = notion_connector.load()

    print("pages: ", len(pages))


@celery.task
def process_integration_brain_sync_user_brain(brain_id, user_id):
    notion_connector = NotionConnector(brain_id=brain_id, user_id=user_id)

    notion_connector.poll()


@celery.task
def ping_telemetry():
    maybe_send_telemetry("ping", {"ping": "pong"})


@celery.task(name="check_if_is_premium_user")
def check_if_is_premium_user():
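    # Overview: fetch current subscriptions and customers, map customer emails to
    # user ids, skip users checked within the memoization window, upsert premium
    # user_settings in batches of 10, and delete settings for users that are no
    # longer premium.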
    if os.getenv("DEACTIVATE_STRIPE") == "true":
        logger.info("Stripe deactivated, skipping check for premium users")
        return True

    supabase = get_supabase_db()
    supabase_db = supabase.db

    paris_tz = timezone("Europe/Paris")
    current_time = datetime.now(paris_tz)
    current_time_str = current_time.strftime("%Y-%m-%d %H:%M:%S.%f")
    logger.debug(f"Current time: {current_time_str}")

    # Define the memoization period (e.g., 1 hour)
    memoization_period = timedelta(hours=1)
    memoization_cutoff = current_time - memoization_period

    # Fetch all necessary data in bulk
    subscriptions = (
        supabase_db.table("subscriptions")
        .select("*")
        .filter("current_period_end", "gt", current_time_str)
        .execute()
    ).data

    customers = (supabase_db.table("customers").select("*").execute()).data

    customer_emails = [customer["email"] for customer in customers]

    # Split customer emails into batches of 20
    email_batches = [
        customer_emails[i : i + 20] for i in range(0, len(customer_emails), 20)
    ]

    users = []
    for email_batch in email_batches:
        batch_users = (
            supabase_db.table("users")
            .select("id, email")
            .in_("email", email_batch)
            .execute()
        ).data
        users.extend(batch_users)

    product_features = (
        supabase_db.table("product_to_features").select("*").execute()
    ).data

    user_settings = (supabase_db.table("user_settings").select("*").execute()).data

    # Create lookup dictionaries for faster access
    user_dict = {user["email"]: user["id"] for user in users}
    customer_dict = {customer["id"]: customer for customer in customers}
    product_dict = {
        product["stripe_product_id"]: product for product in product_features
    }
    settings_dict = {setting["user_id"]: setting for setting in user_settings}

    # Process subscriptions and update user settings
    premium_user_ids = set()
    settings_to_upsert = {}
    for sub in subscriptions:
        logger.info(f"Subscription {sub['id']}")
        if sub["attrs"]["status"] not in ("active", "trialing"):
            logger.info(f"Subscription {sub['id']} is not active or trialing")
            continue

        customer = customer_dict.get(sub["customer"])
        if not customer:
            logger.info(f"No customer found for subscription: {sub['customer']}")
            continue

        user_id = user_dict.get(customer["email"])
        if not user_id:
            logger.info(f"No user found for customer: {customer['email']}")
            continue

        current_settings = settings_dict.get(user_id, {})
        last_check = current_settings.get("last_stripe_check")

        # Skip if the user was checked recently
        if last_check and datetime.fromisoformat(last_check) > memoization_cutoff:
            premium_user_ids.add(user_id)
            logger.info(f"User {user_id} was checked recently")
            continue

        user_id = str(user_id)  # Ensure user_id is a string
        premium_user_ids.add(user_id)

        product_id = sub["attrs"]["items"]["data"][0]["plan"]["product"]
        product = product_dict.get(product_id)
        if not product:
            logger.warning(f"No matching product found for subscription: {sub['id']}")
            continue

        settings_to_upsert[user_id] = {
            "user_id": user_id,
            "max_brains": product["max_brains"],
            "max_brain_size": product["max_brain_size"],
            "monthly_chat_credit": product["monthly_chat_credit"],
            "api_access": product["api_access"],
            "models": product["models"],
            "is_premium": True,
            "last_stripe_check": current_time_str,
        }
        logger.info(f"Upserting settings for user {user_id}")

    # Bulk upsert premium user settings in batches of 10
    settings_list = list(settings_to_upsert.values())
    logger.info(f"Upserting {len(settings_list)} settings")
    for i in range(0, len(settings_list), 10):
        batch = settings_list[i : i + 10]
        supabase_db.table("user_settings").upsert(batch).execute()

    # Delete settings for non-premium users in batches of 10
    settings_to_delete = [
        setting["user_id"]
        for setting in user_settings
        if setting["user_id"] not in premium_user_ids and setting.get("is_premium")
    ]
    for i in range(0, len(settings_to_delete), 10):
        batch = settings_to_delete[i : i + 10]
        supabase_db.table("user_settings").delete().in_("user_id", batch).execute()

    logger.info(
        f"Updated {len(settings_to_upsert)} premium users, deleted settings for {len(settings_to_delete)} non-premium users"
    )
    return True


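# Celery beat schedule for the recurring tasks (process_sync_active is registered
# in another module).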
celery.conf.beat_schedule = {
    "ping_telemetry": {
        "task": f"{__name__}.ping_telemetry",
        "schedule": crontab(minute="*/30", hour="*"),
    },
    "process_sync_active": {
        "task": "process_sync_active",
        "schedule": crontab(minute="*/1", hour="*"),
    },
    "process_premium_users": {
        "task": "check_if_is_premium_user",
        "schedule": crontab(minute="*/1", hour="*"),
    },
}