fix: fixing pdf parsing (#3349)

Our default PDF parser is Unstructured, for which we were using the
'fast' strategy, which fails to parse some PDFs. We now use the
'auto' strategy instead, which is more flexible and powerful.

Closes TICK-86

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: chloedia <chloedaems0@gmail.com>
This commit is contained in:
Jacopo Chevallard 2024-10-14 15:21:12 +02:00 committed by GitHub
parent 90848eb827
commit 367242a3d5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 63 additions and 16 deletions

View File

@ -1,3 +1,4 @@
import os
import re
import unicodedata
@ -15,3 +16,35 @@ def remove_special_characters(input):
except Exception as e:
logger.error(f"Error removing special characters: {e}")
return input
def sanitize_filename(filename: str) -> str:
"""
Sanitize the filename to make it usable.
Args:
filename (str): The original filename.
Returns:
str: The sanitized filename.
This function:
1. Removes or replaces invalid characters
2. Handles double extensions
3. Ensures the filename is not empty
4. Truncates long filenames
"""
valid_chars = re.sub(r"[^\w\-_\. ]", "", filename)
name, ext = os.path.splitext(valid_chars)
name = name.replace(".", "_")
if not name:
name = "unnamed"
max_length = 255 - len(ext)
if len(name) > max_length:
name = name[:max_length]
sanitized_filename = f"{name}{ext}"
return sanitized_filename

View File

@ -29,6 +29,7 @@ from quivr_api.modules.sync.service.sync_service import (
ISyncService,
ISyncUserService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.sync.utils.sync import BaseSync
from quivr_api.modules.upload.service.upload_file import (
check_file_exists,
@ -168,6 +169,8 @@ class SyncUtils:
]:
raise ValueError(f"Incompatible file extension for {downloaded_file}")
storage_path = sanitize_filename(storage_path)
response = await upload_file_storage(
downloaded_file.file_data,
storage_path,

View File

@ -30,6 +30,7 @@ from quivr_api.modules.notification.entity.notification import NotificationsStat
from quivr_api.modules.notification.service.notification_service import (
NotificationService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.upload.service.upload_file import (
upload_file_storage,
)
@ -85,12 +86,14 @@ async def upload_file(
brain_id=str(brain_id),
)
)
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
background_tasks.add_task(
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
maybe_send_telemetry, "upload_file", {"file_name": file_name}
)
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
filename_with_brain_id = str(brain_id) + "/" + file_name
filename_with_brain_id = sanitize_filename(filename_with_brain_id)
buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
try:
@ -110,9 +113,9 @@ async def upload_file(
knowledge_to_add = CreateKnowledgeProperties(
brain_id=brain_id,
file_name=uploadFile.filename,
file_name=file_name,
extension=os.path.splitext(
uploadFile.filename # pyright: ignore reportPrivateUsage=none
file_name # pyright: ignore reportPrivateUsage=none
)[-1].lower(),
source=integration if integration else "local",
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
@ -127,7 +130,7 @@ async def upload_file(
"process_file_task",
kwargs={
"file_name": filename_with_brain_id,
"file_original_name": uploadFile.filename,
"file_original_name": file_name,
"brain_id": brain_id,
"notification_id": upload_notification.id,
"knowledge_id": knowledge.id,

View File

@ -320,22 +320,28 @@ class PDFConverter:
else:
raise ValueError(f"Method {self.method} not supported")
if not gpt4o_cleaner:
return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)
else:
if gpt4o_cleaner:
md_processor = MarkdownProcessor(
parsed_md,
strict=True,
remove_pagination=True,
)
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
return LangChainDocument(
page_content=md_cleaned,
metadata={"filename": file_path.name, "type": "pdf"},
)
parsed_md = md_cleaned
if (
len(parsed_md) < 5
and file_path.stat().st_size > 100
and self.strategy == "fast"
):
logger.info(f"Switching to auto strategy for {file_path.name}")
self.strategy = "auto"
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)
def save_md(self, md_content: str, file_path: Path | str) -> None:
with open(file_path, "w") as f:

View File

@ -59,7 +59,6 @@ class MegaparseProcessor(ProcessorBase):
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
document: Document = await mega_parse.aload()
print("\n\n document: ", document.page_content)
if len(document.page_content) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:

View File

@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
running currently.
"""
active_tasks = celery.control.inspect().active()
if not active_tasks:
return False
for worker, running_tasks in active_tasks.items():
for task in running_tasks:
if task["name"] == task_name: # type: ignore