From 7ff081cf4059babb115768abbdf03ff38cb76106 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Mon, 27 Nov 2023 17:36:46 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20crawler=20(#1735)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixed # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --- backend/celery_worker.py | 94 ++++++++++++++---------- backend/packages/files/parsers/common.py | 2 +- backend/packages/files/processors.py | 9 ++- backend/routes/upload_routes.py | 17 +++++ scripts/quivr_team_only/theodo_gpt4.sql | 2 +- 5 files changed, 82 insertions(+), 42 deletions(-) diff --git a/backend/celery_worker.py b/backend/celery_worker.py index 2a83189cf..a5aa0055b 100644 --- a/backend/celery_worker.py +++ b/backend/celery_worker.py @@ -57,50 +57,68 @@ def process_file_and_notify( file_original_name: str, brain_id, notification_id=None, -): - supabase_client = get_supabase_client() - tmp_file_name = "tmp-file-" + file_name - tmp_file_name = tmp_file_name.replace("/", "_") +): + try: + supabase_client = get_supabase_client() + tmp_file_name = "tmp-file-" + file_name + tmp_file_name = tmp_file_name.replace("/", "_") - with open(tmp_file_name, "wb+") as f: - res = supabase_client.storage.from_("quivr").download(file_name) - f.write(res) - f.seek(0) - file_content = f.read() + with open(tmp_file_name, "wb+") as f: + res = supabase_client.storage.from_("quivr").download(file_name) + f.write(res) + f.seek(0) + file_content = f.read() - upload_file = UploadFile( - file=f, filename=file_name.split("/")[-1], size=len(file_content) - ) - - file_instance = File(file=upload_file) - loop = asyncio.get_event_loop() - message = loop.run_until_complete( - filter_file( - file=file_instance, - brain_id=brain_id, - original_file_name=file_original_name, + upload_file = UploadFile( + file=f, filename=file_name.split("/")[-1], size=len(file_content) ) - ) - f.close() - os.remove(tmp_file_name) - - if notification_id: - notification_message = { - "status": message["type"], - "message": message["message"], - "name": file_instance.file.filename if file_instance.file else "", - } - update_notification_by_id( - notification_id, - NotificationUpdatableProperties( - status=NotificationsStatusEnum.Done, - message=str(notification_message), - ), + file_instance = File(file=upload_file) + loop = asyncio.get_event_loop() + message = loop.run_until_complete( + filter_file( + file=file_instance, + brain_id=brain_id, + original_file_name=file_original_name, + ) ) - update_brain_last_update_time(brain_id) - return True + f.close() + os.remove(tmp_file_name) + + if notification_id: + notification_message = { + "status": message["type"], + "message": message["message"], + "name": file_instance.file.filename if file_instance.file else "", + } + update_notification_by_id( + notification_id, + NotificationUpdatableProperties( + status=NotificationsStatusEnum.Done, + message=str(notification_message), + ), + ) + update_brain_last_update_time(brain_id) + + return True + except Exception as e: + notification_message = { + "status": "error", + "message": "There was an error uploading the file. Please check the file and try again. If the issue persist, please open an issue on Github", + "name": file_instance.file.filename if file_instance.file else "", + } + update_notification_by_id( + notification_id, + NotificationUpdatableProperties( + status=NotificationsStatusEnum.Done, + message=str(notification_message), + ), + ) + raise e + + + @celery.task(name="process_crawl_and_notify") diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py index eacd501ab..cfd7d1b68 100644 --- a/backend/packages/files/parsers/common.py +++ b/backend/packages/files/parsers/common.py @@ -31,4 +31,4 @@ async def process_file( brain_id, doc_with_metadata.to_json(), file.file_sha1 ) - return "Hello World!" + return len(file.documents) diff --git a/backend/packages/files/processors.py b/backend/packages/files/processors.py index d7fb0e9ca..e4363581c 100644 --- a/backend/packages/files/processors.py +++ b/backend/packages/files/processors.py @@ -81,12 +81,17 @@ async def filter_file( if file.file_extension in file_processors: try: - await file_processors[file.file_extension]( + result = await file_processors[file.file_extension]( file=file, brain_id=brain_id, ) + if result is None or result == 0: + return create_response( + f"? {using_file_name} has been uploaded to brain. There might have been an error while reading it, please make sure the file is not illformed or just an image", # pyright: ignore reportPrivateUsage=none + "warning", + ) return create_response( - f"✅ {using_file_name} has been uploaded to brain {brain.name}.", # pyright: ignore reportPrivateUsage=none + f"✅ {using_file_name} has been uploaded to brain {brain.name} in {result} chunks", # pyright: ignore reportPrivateUsage=none "success", ) except Exception as e: diff --git a/backend/routes/upload_routes.py b/backend/routes/upload_routes.py index 26d6bd0f2..c69004345 100644 --- a/backend/routes/upload_routes.py +++ b/backend/routes/upload_routes.py @@ -15,6 +15,10 @@ from packages.files.file import convert_bytes, get_file_size from repository.knowledge.add_knowledge import add_knowledge from repository.files.upload_file import upload_file_storage from repository.notification.add_notification import add_notification +from repository.notification.update_notification import update_notification_by_id +from models.databases.supabase.notifications import NotificationUpdatableProperties + + from routes.authorizations.brain_authorization import ( RoleEnum, @@ -73,6 +77,19 @@ async def upload_file( logger.info(f"File {file_in_storage} uploaded successfully") except Exception as e: + print(e) + notification_message = { + "status": "error", + "message": "There was an error uploading the file. Please check the file and try again. If the issue persist, please open an issue on Github", + "name": uploadFile.filename if uploadFile else "Last Upload File", + } + update_notification_by_id( + upload_notification.id, + NotificationUpdatableProperties( + status=NotificationsStatusEnum.Done, + message=str(notification_message), + ), + ) if "The resource already exists" in str(e): raise HTTPException( status_code=403, diff --git a/scripts/quivr_team_only/theodo_gpt4.sql b/scripts/quivr_team_only/theodo_gpt4.sql index 613399356..2160bf918 100644 --- a/scripts/quivr_team_only/theodo_gpt4.sql +++ b/scripts/quivr_team_only/theodo_gpt4.sql @@ -1,7 +1,7 @@ CREATE OR REPLACE FUNCTION update_max_brains_theodo() RETURNS TRIGGER AS $$ DECLARE userEmail TEXT; - allowedDomains TEXT[] := ARRAY['%@theodo.fr', '%@theodo.com', '%@theodo.co.uk', '%@bam.tech', '%@padok.fr', '%@sicara.com', '%@hokla.com', '%@sipios.com']; + allowedDomains TEXT[] := ARRAY['%@theodo.fr', '%@theodo.com', '%@theodo.co.uk', '%@bam.tech', '%@padok.fr', '%@aleios.com', '%@sicara.com', '%@hokla.com', '%@sipios.com']; BEGIN SELECT email INTO userEmail FROM auth.users WHERE id = NEW.user_id;