mirror of
https://github.com/StanGirard/quivr.git
synced 2024-11-29 14:57:03 +03:00
fix: fixing pdf parsing (#3349)
Our default PDF parser is Unstructured, for which we were using the 'fast' strategy, which fails to parse some PDFs. We now use the 'auto' strategy instead, which is more flexible and powerful. Closes TICK-86 # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --------- Co-authored-by: chloedia <chloedaems0@gmail.com>
This commit is contained in:
parent
90848eb827
commit
367242a3d5
@ -1,3 +1,4 @@
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
@ -15,3 +16,35 @@ def remove_special_characters(input):
|
||||
except Exception as e:
|
||||
logger.error(f"Error removing special characters: {e}")
|
||||
return input
|
||||
|
||||
|
||||
def sanitize_filename(filename: str) -> str:
    """
    Return a storage-safe version of *filename*.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.

    The transformation:
    1. Strips characters other than word chars, hyphen, underscore, dot, space.
    2. Collapses double/extra extensions by turning interior dots into
       underscores (only the final extension is kept as-is).
    3. Falls back to "unnamed" when nothing usable remains.
    4. Truncates the stem so stem + extension fits in 255 characters.
    """
    # Drop every character outside the allowed set.
    cleaned = re.sub(r"[^\w\-_\. ]", "", filename)

    # Separate the final extension from the rest of the name.
    stem, extension = os.path.splitext(cleaned)

    # Any remaining dots belong to "double extensions"; neutralize them.
    stem = stem.replace(".", "_")

    if not stem:
        stem = "unnamed"

    # Keep the full filename within the common 255-char filesystem limit.
    limit = 255 - len(extension)
    stem = stem[:limit] if len(stem) > limit else stem

    return f"{stem}{extension}"
|
||||
|
@ -29,6 +29,7 @@ from quivr_api.modules.sync.service.sync_service import (
|
||||
ISyncService,
|
||||
ISyncUserService,
|
||||
)
|
||||
from quivr_api.modules.sync.utils.normalize import sanitize_filename
|
||||
from quivr_api.modules.sync.utils.sync import BaseSync
|
||||
from quivr_api.modules.upload.service.upload_file import (
|
||||
check_file_exists,
|
||||
@ -168,6 +169,8 @@ class SyncUtils:
|
||||
]:
|
||||
raise ValueError(f"Incompatible file extension for {downloaded_file}")
|
||||
|
||||
storage_path = sanitize_filename(storage_path)
|
||||
|
||||
response = await upload_file_storage(
|
||||
downloaded_file.file_data,
|
||||
storage_path,
|
||||
|
@ -30,6 +30,7 @@ from quivr_api.modules.notification.entity.notification import NotificationsStat
|
||||
from quivr_api.modules.notification.service.notification_service import (
|
||||
NotificationService,
|
||||
)
|
||||
from quivr_api.modules.sync.utils.normalize import sanitize_filename
|
||||
from quivr_api.modules.upload.service.upload_file import (
|
||||
upload_file_storage,
|
||||
)
|
||||
@ -85,12 +86,14 @@ async def upload_file(
|
||||
brain_id=str(brain_id),
|
||||
)
|
||||
)
|
||||
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
|
||||
|
||||
background_tasks.add_task(
|
||||
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
|
||||
maybe_send_telemetry, "upload_file", {"file_name": file_name}
|
||||
)
|
||||
|
||||
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
|
||||
filename_with_brain_id = str(brain_id) + "/" + file_name
|
||||
filename_with_brain_id = sanitize_filename(filename_with_brain_id)
|
||||
|
||||
buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
|
||||
try:
|
||||
@ -110,9 +113,9 @@ async def upload_file(
|
||||
|
||||
knowledge_to_add = CreateKnowledgeProperties(
|
||||
brain_id=brain_id,
|
||||
file_name=uploadFile.filename,
|
||||
file_name=file_name,
|
||||
extension=os.path.splitext(
|
||||
uploadFile.filename # pyright: ignore reportPrivateUsage=none
|
||||
file_name # pyright: ignore reportPrivateUsage=none
|
||||
)[-1].lower(),
|
||||
source=integration if integration else "local",
|
||||
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
|
||||
@ -127,7 +130,7 @@ async def upload_file(
|
||||
"process_file_task",
|
||||
kwargs={
|
||||
"file_name": filename_with_brain_id,
|
||||
"file_original_name": uploadFile.filename,
|
||||
"file_original_name": file_name,
|
||||
"brain_id": brain_id,
|
||||
"notification_id": upload_notification.id,
|
||||
"knowledge_id": knowledge.id,
|
||||
|
@ -320,22 +320,28 @@ class PDFConverter:
|
||||
else:
|
||||
raise ValueError(f"Method {self.method} not supported")
|
||||
|
||||
if not gpt4o_cleaner:
|
||||
return LangChainDocument(
|
||||
page_content=parsed_md,
|
||||
metadata={"filename": file_path.name, "type": "pdf"},
|
||||
)
|
||||
else:
|
||||
if gpt4o_cleaner:
|
||||
md_processor = MarkdownProcessor(
|
||||
parsed_md,
|
||||
strict=True,
|
||||
remove_pagination=True,
|
||||
)
|
||||
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
|
||||
return LangChainDocument(
|
||||
page_content=md_cleaned,
|
||||
metadata={"filename": file_path.name, "type": "pdf"},
|
||||
)
|
||||
parsed_md = md_cleaned
|
||||
|
||||
if (
|
||||
len(parsed_md) < 5
|
||||
and file_path.stat().st_size > 100
|
||||
and self.strategy == "fast"
|
||||
):
|
||||
logger.info(f"Switching to auto strategy for {file_path.name}")
|
||||
self.strategy = "auto"
|
||||
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
|
||||
|
||||
return LangChainDocument(
|
||||
page_content=parsed_md,
|
||||
metadata={"filename": file_path.name, "type": "pdf"},
|
||||
)
|
||||
|
||||
def save_md(self, md_content: str, file_path: Path | str) -> None:
|
||||
with open(file_path, "w") as f:
|
||||
|
@ -59,7 +59,6 @@ class MegaparseProcessor(ProcessorBase):
|
||||
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
|
||||
document: Document = await mega_parse.aload()
|
||||
print("\n\n document: ", document.page_content)
|
||||
if len(document.page_content) > self.splitter_config.chunk_size:
|
||||
docs = self.text_splitter.split_documents([document])
|
||||
for doc in docs:
|
||||
|
@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
|
||||
running currently.
|
||||
"""
|
||||
active_tasks = celery.control.inspect().active()
|
||||
if not active_tasks:
|
||||
return False
|
||||
|
||||
for worker, running_tasks in active_tasks.items():
|
||||
for task in running_tasks:
|
||||
if task["name"] == task_name: # type: ignore
|
||||
|
Loading…
Reference in New Issue
Block a user