From badb27bf195137ad1bc3321fbfa97b24094b1dfb Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Sat, 13 May 2023 00:39:40 +0200 Subject: [PATCH] feat(metadata): updated metadata --- loaders/audio.py | 8 +++++--- loaders/common.py | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/loaders/audio.py b/loaders/audio.py index d79db3ce4..8a7e3e913 100644 --- a/loaders/audio.py +++ b/loaders/audio.py @@ -1,7 +1,7 @@ import os import tempfile from io import BytesIO - +import time import openai import streamlit as st from langchain.document_loaders import TextLoader @@ -16,6 +16,7 @@ from langchain.schema import Document def _transcribe_audio(api_key, audio_file): openai.api_key = api_key transcript = "" + with BytesIO(audio_file.read()) as audio_bytes: # Get the extension of the uploaded file file_extension = os.path.splitext(audio_file.name)[-1] @@ -32,7 +33,8 @@ def _transcribe_audio(api_key, audio_file): def process_audio(openai_api_key, vector_store, file_name): file_sha = "" - + dateshort = time.strftime("%Y%m%d-%H%M%S") + file_name = f"audiotranscript_{dateshort}.audio" transcript = _transcribe_audio(openai_api_key, file_name) file_sha = compute_sha1_from_content(transcript.text.encode("utf-8")) @@ -44,7 +46,7 @@ def process_audio(openai_api_key, vector_store, file_name): text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap) texts = text_splitter.split_text(transcript.text) - docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha}) for text in texts] + docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha,"file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts] vector_store.add_documents(docs_with_metadata) diff --git a/loaders/common.py b/loaders/common.py index a4607375f..146a3e2f7 100644 --- a/loaders/common.py +++ b/loaders/common.py @@ -1,4 +1,5 @@ import tempfile +import time from utils import compute_sha1_from_file from langchain.schema import Document import streamlit as st @@ -7,6 +8,8 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter def process_file(vector_store, file, loader_class, file_suffix): documents = [] file_sha = "" + file_name = file.name + dateshort = time.strftime("%Y%m%d") with tempfile.NamedTemporaryFile(delete=True, suffix=file_suffix) as tmp_file: tmp_file.write(file.getvalue()) tmp_file.flush() @@ -23,7 +26,7 @@ def process_file(vector_store, file, loader_class, file_suffix): documents = text_splitter.split_documents(documents) # Add the document sha1 as metadata to each document - docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1}) for doc in documents] + docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1, "file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for doc in documents] vector_store.add_documents(docs_with_metadata) return \ No newline at end of file