feat: Add filename to page content in embedding

This commit is contained in:
Stan Girard 2024-04-15 11:11:52 +02:00
parent 7c9f67e539
commit 6a691d36d0

View File

@@ -41,6 +41,8 @@ async def process_file(
if file.documents is not None:
for doc in file.documents: # pyright: ignore reportPrivateUsage=none
new_metadata = metadata.copy()
# Add filename at beginning of page content
doc.page_content = f"Filename: {new_metadata['original_file_name']} Content: {doc.page_content}"
len_chunk = len(enc.encode(doc.page_content))
page_content_encoded = doc.page_content.encode("unicode_escape").decode(
"ascii", "replace"