fix: fixing pdf parsing (#3349)

Our default PDF parser is Unstructured, for which we were using the
'fast' strategy, which fails to parse some PDFs. We now use the
'auto' strategy instead, which is more flexible and powerful.

Closes TICK-86

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: chloedia <chloedaems0@gmail.com>
This commit is contained in:
Jacopo Chevallard 2024-10-14 15:21:12 +02:00 committed by GitHub
parent 90848eb827
commit 367242a3d5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 63 additions and 16 deletions

View File

@ -1,3 +1,4 @@
import os
import re
import unicodedata
@ -15,3 +16,35 @@ def remove_special_characters(input):
except Exception as e:
logger.error(f"Error removing special characters: {e}")
return input
def sanitize_filename(filename: str) -> str:
"""
Sanitize the filename to make it usable.
Args:
filename (str): The original filename.
Returns:
str: The sanitized filename.
This function:
1. Removes or replaces invalid characters
2. Handles double extensions
3. Ensures the filename is not empty
4. Truncates long filenames
"""
valid_chars = re.sub(r"[^\w\-_\. ]", "", filename)
name, ext = os.path.splitext(valid_chars)
name = name.replace(".", "_")
if not name:
name = "unnamed"
max_length = 255 - len(ext)
if len(name) > max_length:
name = name[:max_length]
sanitized_filename = f"{name}{ext}"
return sanitized_filename

View File

@ -29,6 +29,7 @@ from quivr_api.modules.sync.service.sync_service import (
ISyncService,
ISyncUserService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.sync.utils.sync import BaseSync
from quivr_api.modules.upload.service.upload_file import (
check_file_exists,
@ -168,6 +169,8 @@ class SyncUtils:
]:
raise ValueError(f"Incompatible file extension for {downloaded_file}")
storage_path = sanitize_filename(storage_path)
response = await upload_file_storage(
downloaded_file.file_data,
storage_path,

View File

@ -30,6 +30,7 @@ from quivr_api.modules.notification.entity.notification import NotificationsStat
from quivr_api.modules.notification.service.notification_service import (
NotificationService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.upload.service.upload_file import (
upload_file_storage,
)
@ -85,12 +86,14 @@ async def upload_file(
brain_id=str(brain_id),
)
)
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
background_tasks.add_task(
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
maybe_send_telemetry, "upload_file", {"file_name": file_name}
)
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
filename_with_brain_id = str(brain_id) + "/" + file_name
filename_with_brain_id = sanitize_filename(filename_with_brain_id)
buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
try:
@ -110,9 +113,9 @@ async def upload_file(
knowledge_to_add = CreateKnowledgeProperties(
brain_id=brain_id,
file_name=uploadFile.filename,
file_name=file_name,
extension=os.path.splitext(
uploadFile.filename # pyright: ignore reportPrivateUsage=none
file_name # pyright: ignore reportPrivateUsage=none
)[-1].lower(),
source=integration if integration else "local",
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
@ -127,7 +130,7 @@ async def upload_file(
"process_file_task",
kwargs={
"file_name": filename_with_brain_id,
"file_original_name": uploadFile.filename,
"file_original_name": file_name,
"brain_id": brain_id,
"notification_id": upload_notification.id,
"knowledge_id": knowledge.id,

View File

@ -320,22 +320,28 @@ class PDFConverter:
else:
raise ValueError(f"Method {self.method} not supported")
if not gpt4o_cleaner:
return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)
else:
if gpt4o_cleaner:
md_processor = MarkdownProcessor(
parsed_md,
strict=True,
remove_pagination=True,
)
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
return LangChainDocument(
page_content=md_cleaned,
metadata={"filename": file_path.name, "type": "pdf"},
)
parsed_md = md_cleaned
if (
len(parsed_md) < 5
and file_path.stat().st_size > 100
and self.strategy == "fast"
):
logger.info(f"Switching to auto strategy for {file_path.name}")
self.strategy = "auto"
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)
def save_md(self, md_content: str, file_path: Path | str) -> None:
with open(file_path, "w") as f:

View File

@ -59,7 +59,6 @@ class MegaparseProcessor(ProcessorBase):
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
document: Document = await mega_parse.aload()
print("\n\n document: ", document.page_content)
if len(document.page_content) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:

View File

@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
running currently.
"""
active_tasks = celery.control.inspect().active()
if not active_tasks:
return False
for worker, running_tasks in active_tasks.items():
for task in running_tasks:
if task["name"] == task_name: # type: ignore