mirror of
https://github.com/StanGirard/quivr.git
synced 2024-11-23 04:17:48 +03:00
fix: utf8 encoding (#2555)
# Description

Remove the substitution that replaced non-ASCII characters with spaces, so UTF-8 text in document content is preserved.

## Checklist before requesting a review

Please delete options that are not relevant.

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that my feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged

## Screenshots (if appropriate):
This commit is contained in:
parent
cde758055b
commit
748733df2d
@ -1,5 +1,4 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
@ -84,8 +83,6 @@ async def process_file(
|
||||
doc.page_content = f"Filename: {new_metadata['original_file_name']} Content: {doc.page_content}"
|
||||
|
||||
doc.page_content = doc.page_content.replace("\u0000", "")
|
||||
# Replace unsupported Unicode characters
|
||||
doc.page_content = re.sub(r"[^\x00-\x7F]+", " ", doc.page_content)
|
||||
|
||||
len_chunk = len(enc.encode(doc.page_content))
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user