fix: 🐛 crawler (#1735)

Fixed the crawler/file-upload flow: errors raised while processing or uploading a file now update the associated notification with an error message instead of failing silently.

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
This commit is contained in:
Stan Girard 2023-11-27 17:36:46 +01:00 committed by GitHub
parent 10e94e5a91
commit 7ff081cf40
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 82 additions and 42 deletions

View File

@ -57,50 +57,68 @@ def process_file_and_notify(
file_original_name: str,
brain_id,
notification_id=None,
):
supabase_client = get_supabase_client()
tmp_file_name = "tmp-file-" + file_name
tmp_file_name = tmp_file_name.replace("/", "_")
):
try:
supabase_client = get_supabase_client()
tmp_file_name = "tmp-file-" + file_name
tmp_file_name = tmp_file_name.replace("/", "_")
with open(tmp_file_name, "wb+") as f:
res = supabase_client.storage.from_("quivr").download(file_name)
f.write(res)
f.seek(0)
file_content = f.read()
with open(tmp_file_name, "wb+") as f:
res = supabase_client.storage.from_("quivr").download(file_name)
f.write(res)
f.seek(0)
file_content = f.read()
upload_file = UploadFile(
file=f, filename=file_name.split("/")[-1], size=len(file_content)
)
file_instance = File(file=upload_file)
loop = asyncio.get_event_loop()
message = loop.run_until_complete(
filter_file(
file=file_instance,
brain_id=brain_id,
original_file_name=file_original_name,
upload_file = UploadFile(
file=f, filename=file_name.split("/")[-1], size=len(file_content)
)
)
f.close()
os.remove(tmp_file_name)
if notification_id:
notification_message = {
"status": message["type"],
"message": message["message"],
"name": file_instance.file.filename if file_instance.file else "",
}
update_notification_by_id(
notification_id,
NotificationUpdatableProperties(
status=NotificationsStatusEnum.Done,
message=str(notification_message),
),
file_instance = File(file=upload_file)
loop = asyncio.get_event_loop()
message = loop.run_until_complete(
filter_file(
file=file_instance,
brain_id=brain_id,
original_file_name=file_original_name,
)
)
update_brain_last_update_time(brain_id)
return True
f.close()
os.remove(tmp_file_name)
if notification_id:
notification_message = {
"status": message["type"],
"message": message["message"],
"name": file_instance.file.filename if file_instance.file else "",
}
update_notification_by_id(
notification_id,
NotificationUpdatableProperties(
status=NotificationsStatusEnum.Done,
message=str(notification_message),
),
)
update_brain_last_update_time(brain_id)
return True
except Exception as e:
notification_message = {
"status": "error",
"message": "There was an error uploading the file. Please check the file and try again. If the issue persist, please open an issue on Github",
"name": file_instance.file.filename if file_instance.file else "",
}
update_notification_by_id(
notification_id,
NotificationUpdatableProperties(
status=NotificationsStatusEnum.Done,
message=str(notification_message),
),
)
raise e
@celery.task(name="process_crawl_and_notify")

View File

@ -31,4 +31,4 @@ async def process_file(
brain_id, doc_with_metadata.to_json(), file.file_sha1
)
return "Hello World!"
return len(file.documents)

View File

@ -81,12 +81,17 @@ async def filter_file(
if file.file_extension in file_processors:
try:
await file_processors[file.file_extension](
result = await file_processors[file.file_extension](
file=file,
brain_id=brain_id,
)
if result is None or result == 0:
return create_response(
f" {using_file_name} has been uploaded to brain. There might have been an error while reading it, please make sure the file is not illformed or just an image", # pyright: ignore reportPrivateUsage=none
"warning",
)
return create_response(
f"{using_file_name} has been uploaded to brain {brain.name}.", # pyright: ignore reportPrivateUsage=none
f"{using_file_name} has been uploaded to brain {brain.name} in {result} chunks", # pyright: ignore reportPrivateUsage=none
"success",
)
except Exception as e:

View File

@ -15,6 +15,10 @@ from packages.files.file import convert_bytes, get_file_size
from repository.knowledge.add_knowledge import add_knowledge
from repository.files.upload_file import upload_file_storage
from repository.notification.add_notification import add_notification
from repository.notification.update_notification import update_notification_by_id
from models.databases.supabase.notifications import NotificationUpdatableProperties
from routes.authorizations.brain_authorization import (
RoleEnum,
@ -73,6 +77,19 @@ async def upload_file(
logger.info(f"File {file_in_storage} uploaded successfully")
except Exception as e:
print(e)
notification_message = {
"status": "error",
"message": "There was an error uploading the file. Please check the file and try again. If the issue persist, please open an issue on Github",
"name": uploadFile.filename if uploadFile else "Last Upload File",
}
update_notification_by_id(
upload_notification.id,
NotificationUpdatableProperties(
status=NotificationsStatusEnum.Done,
message=str(notification_message),
),
)
if "The resource already exists" in str(e):
raise HTTPException(
status_code=403,

View File

@ -1,7 +1,7 @@
CREATE OR REPLACE FUNCTION update_max_brains_theodo() RETURNS TRIGGER AS $$
DECLARE
userEmail TEXT;
allowedDomains TEXT[] := ARRAY['%@theodo.fr', '%@theodo.com', '%@theodo.co.uk', '%@bam.tech', '%@padok.fr', '%@sicara.com', '%@hokla.com', '%@sipios.com'];
allowedDomains TEXT[] := ARRAY['%@theodo.fr', '%@theodo.com', '%@theodo.co.uk', '%@bam.tech', '%@padok.fr', '%@aleios.com', '%@sicara.com', '%@hokla.com', '%@sipios.com'];
BEGIN
SELECT email INTO userEmail FROM auth.users WHERE id = NEW.user_id;