feat(init): init repository

This commit is contained in:
Stan Girard 2023-05-12 23:05:31 +02:00
commit 921d7e2502
12 changed files with 430 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
secondbrain/
.env
.streamlit/
**/*.pyc
toto.txt

37
files.py Normal file
View File

@@ -0,0 +1,37 @@
import streamlit as st
from loaders.audio import process_audio
from loaders.txt import process_txt
from loaders.csv import process_csv
from loaders.markdown import process_markdown
from utils import compute_sha1_from_content
def file_uploader(supabase, openai_key, vector_store):
    """Render the upload widget and ingest each accepted file into the vector store.

    Files already present (matched by content SHA-1) are skipped; unknown
    extensions are reported without being processed.
    """
    accepted_types = ["txt", "csv", "md", "m4a", "mp3", "webm", "mp4", "mpga", "wav", "mpeg"]
    files = st.file_uploader("Upload a file", accept_multiple_files=True, type=accepted_types)
    if not st.button("Add to Database"):
        return
    if files is None:
        return
    for file in files:
        if file_already_exists(supabase, file):
            st.write(f"😎 {file.name} is already in the database.")
            continue
        name = file.name
        if name.endswith(".txt"):
            process_txt(vector_store, file)
        elif name.endswith((".m4a", ".mp3", ".webm", ".mp4", ".mpga", ".wav", ".mpeg")):
            process_audio(openai_key, vector_store, file)
        elif name.endswith(".csv"):
            process_csv(vector_store, file)
        elif name.endswith(".md"):
            process_markdown(vector_store, file)
        else:
            st.write(f"{name} is not a valid file type.")
            continue
        # Shared success message for every processed file.
        st.write(f"{file.name} ")
def file_already_exists(supabase, file):
    """Return True if a document with the same content SHA-1 is already stored.

    The hash is computed over the uploaded file's raw bytes and compared
    against the ``file_sha1`` key stored in each document's JSON metadata.
    """
    file_sha1 = compute_sha1_from_content(file.getvalue())
    response = supabase.table("documents").select("id").eq("metadata->>file_sha1", file_sha1).execute()
    # A non-empty result set means the document exists; return the boolean
    # directly instead of the verbose if/else from the original.
    return len(response.data) > 0

0
loaders/__init__.py Normal file
View File

51
loaders/audio.py Normal file
View File

@@ -0,0 +1,51 @@
import os
import tempfile
from io import BytesIO
import openai
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_content
from langchain.schema import Document
# Create a function to transcribe audio using Whisper
def _transcribe_audio(api_key, audio_file):
    """Translate an uploaded audio file to English text using OpenAI Whisper.

    ``audio_file`` is an uploaded-file object exposing ``.read()`` and ``.name``.
    Returns the Whisper API response (its ``.text`` attribute holds the text).
    """
    openai.api_key = api_key
    # Keep the original extension so the API can detect the audio format.
    file_extension = os.path.splitext(audio_file.name)[-1]
    # The API wants a real file handle; spool the upload to a temp file.
    # delete=True removes it automatically when the context exits.
    # NOTE: the original wrapped audio_file.read() in a redundant BytesIO
    # only to .read() it again — the bytes can be written directly.
    with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
        temp_audio_file.write(audio_file.read())
        temp_audio_file.seek(0)  # rewind so the API reads from the start
        transcript = openai.Audio.translate("whisper-1", temp_audio_file)
    return transcript
def process_audio(openai_api_key, vector_store, file_name):
    """Transcribe an uploaded audio file, chunk the transcript, and index it.

    ``file_name`` is the uploaded-file object (despite the name — TODO confirm
    with callers before renaming). Returns the vector store.
    """
    # (The original initialized file_sha = "" and immediately overwrote it;
    # the dead assignment is removed.)
    transcript = _transcribe_audio(openai_api_key, file_name)
    # Hash the transcript text, not the raw audio, so identical content dedupes.
    file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(transcript.text)
    # Attach the content hash as metadata to every chunk for later dedup lookups.
    docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha})
                          for text in texts]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

34
loaders/csv.py Normal file
View File

@@ -0,0 +1,34 @@
import tempfile
import streamlit as st
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_file
from langchain.schema import Document
def process_csv(vector_store, file):
    """Load an uploaded CSV file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    (The original returned None; callers in files.py ignore the return value,
    so this is backward compatible.)
    """
    # (Dead initializations documents = [] and file_sha = "" removed.)
    # CSVLoader needs a path on disk, so spool the upload to a temp file;
    # delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = CSVLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk (stored via
    # vector_store.add_documents into the default `documents` table).
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in documents]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

36
loaders/markdown.py Normal file
View File

@@ -0,0 +1,36 @@
import tempfile
import streamlit as st
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_file
def process_markdown(vector_store, file):
    """Load an uploaded Markdown file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    (The original returned None; callers in files.py ignore the return value,
    so this is backward compatible.)
    """
    # (Dead initializations documents = [] and file_sha = "" removed.)
    # UnstructuredMarkdownLoader needs a path on disk, so spool the upload
    # to a temp file; delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".md") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = UnstructuredMarkdownLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk for deduplication.
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in documents]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

35
loaders/txt.py Normal file
View File

@@ -0,0 +1,35 @@
import tempfile
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
def process_txt(vector_store, file):
    """Load an uploaded text file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    """
    # BUG FIX: compute_sha1_from_file is called below but was never imported
    # in loaders/txt.py (unlike loaders/csv.py and loaders/markdown.py), so
    # the original raised NameError at runtime. Imported locally here so this
    # fix is self-contained.
    from utils import compute_sha1_from_file

    # (Dead initializations documents = [] and file_sha = "" removed.)
    # TextLoader needs a path on disk, so spool the upload to a temp file;
    # delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".txt") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = TextLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk for deduplication.
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in docs]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

35
main.py Normal file
View File

@@ -0,0 +1,35 @@
import os
import tempfile
import streamlit as st
from sidebar import sidebar
from files import file_uploader
from question import chat_with_doc
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase import Client, create_client
# Streamlit entry point: wires together the sidebar, the file uploader, and
# the chat UI against a Supabase-backed vector store.

# Credentials are read from Streamlit secrets (.streamlit/secrets.toml).
supabase_url = st.secrets.supabase_url
supabase_key = st.secrets.supabase_key
openai_api_key = st.secrets.openai_api_key

# Shared clients: one Supabase connection, one embedding model, and the
# vector store over the default `documents` table.
supabase: Client = create_client(supabase_url, supabase_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vector_store = SupabaseVectorStore(supabase, embeddings, table_name="documents")

# Page layout, rendered top to bottom: title, sidebar config, upload
# section, divider, then the chat section.
st.title("🧠 Second Brain 🧠")
st.markdown("Store your knowledge in a vector store and query it with OpenAI's GPT-3/4.")
st.markdown("---\n\n")
sidebar(supabase)
file_uploader(supabase,openai_api_key, vector_store)
st.markdown("---\n\n")
chat_with_doc(openai_api_key, vector_store)

18
question.py Normal file
View File

@@ -0,0 +1,18 @@
import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
# Module-level conversation buffer so chat history survives Streamlit reruns.
# NOTE(review): a module-level object is also shared across all user sessions
# in a multi-user deployment — confirm that is intended.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


def chat_with_doc(openai_api_key,vector_store):
    """Render a question box and answer it via retrieval-augmented chat.

    Builds a ConversationalRetrievalChain over the vector store's retriever
    and writes the chain's answer to the page when "Ask" is clicked.
    """
    question = st.text_input("## Ask a question")
    # Model and temperature come from the sidebar; fall back to defaults
    # when the sidebar hasn't stored them yet.
    temperature = st.session_state.get("temperature", 0.0)
    model = st.session_state.get("model", "gpt-3.5-turbo")
    button = st.button("Ask")
    if button:
        # The chain is rebuilt each click; `memory` carries the running history.
        qa = ConversationalRetrievalChain.from_llm(OpenAI(model_name=model, openai_api_key=openai_api_key, temperature=temperature), vector_store.as_retriever(), memory=memory)
        result = qa({"question": question})
        st.write(result["answer"])

136
requirements.txt Normal file
View File

@@ -0,0 +1,136 @@
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
anyio==3.6.2
argilla==1.7.0
async-timeout==4.0.2
attrs==23.1.0
backoff==2.2.1
bleach==6.0.0
blinker==1.6.2
cachetools==5.3.0
certifi==2023.5.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==3.1.0
click==8.1.3
click-log==0.4.0
colorama==0.4.6
commonmark==0.9.1
cryptography==40.0.2
dataclasses==0.6
dataclasses-json==0.5.7
decorator==5.1.1
Deprecated==1.2.13
deprecation==2.1.0
docutils==0.20
dotty-dict==1.3.1
entrypoints==0.4
et-xmlfile==1.1.0
frozenlist==1.3.3
gitdb==4.0.10
GitPython==3.1.31
gotrue==1.0.1
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==2.10
importlib-metadata==6.6.0
iniconfig==2.0.0
invoke==1.7.3
jaraco.classes==3.2.3
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
keyring==23.13.1
langchain==0.0.166
lxml==4.9.2
Markdown==3.4.3
markdown-it-py==2.2.0
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
mdurl==0.1.2
monotonic==1.6
more-itertools==9.1.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
nltk==3.8.1
numexpr==2.8.4
numpy==1.23.5
olefile==0.46
openai==0.27.6
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.1
pandas==1.5.3
pdfminer.six==20221105
Pillow==9.5.0
pkginfo==1.9.6
pluggy==1.0.0
postgrest==0.10.6
postgrest-py==0.4.0
protobuf==3.20.3
py==1.11.0
pyarrow==12.0.0
pycparser==2.21
pydantic==1.10.7
pydeck==0.8.1b0
Pygments==2.15.1
Pympler==1.0.1
pypandoc==1.11
pypdf==3.8.1
pyrsistent==0.19.3
pytest==6.2.5
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-gitlab==3.14.0
python-magic==0.4.27
python-pptx==0.6.21
python-semantic-release==7.33.2
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
readme-renderer==37.3
realtime==1.0.0
realtime-py==0.1.3
regex==2023.5.5
requests==2.30.0
requests-toolbelt==1.0.0
rfc3986==1.5.0
rich==13.0.1
semver==2.13.0
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
SQLAlchemy==2.0.13
storage3==0.5.2
streamlit==1.22.0
StrEnum==0.4.10
supabase==1.0.3
supabase-py==0.0.2
supafunc==0.2.2
tenacity==8.2.2
tiktoken==0.4.0
toml==0.10.2
tomlkit==0.11.8
toolz==0.12.0
tornado==6.3.1
tqdm==4.65.0
twine==3.8.0
typer==0.9.0
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
tzlocal==4.3
unstructured==0.6.5
urllib3==1.26.15
validators==0.20.0
webencodings==0.5.1
websockets==10.4
wrapt==1.14.1
XlsxWriter==3.1.0
yarl==1.9.2
zipp==3.15.0

32
sidebar.py Normal file
View File

@@ -0,0 +1,32 @@
import streamlit as st
def sidebar(supabase):
    """Draw the configuration sidebar and persist every choice in session state."""
    st.sidebar.title("Configuration")
    # Show how many documents are currently stored in the database.
    doc_count = number_of_documents(supabase)
    st.sidebar.markdown(f"**Docs in DB:** {doc_count}")
    # Model and generation settings, stored directly in session state.
    st.session_state.model = st.sidebar.selectbox(
        "Select Model", ["gpt-3.5-turbo", "gpt-4"])
    st.session_state.temperature = st.sidebar.slider(
        "Select Temperature", 0.0, 1.0, 0.0, 0.1)
    # Chunking settings consumed by the document loaders.
    st.session_state.chunk_size = st.sidebar.slider(
        "Select Chunk Size", 100, 1000, 500, 50)
    st.session_state.chunk_overlap = st.sidebar.slider(
        "Select Chunk Overlap", 0, 100, 0, 10)
def number_of_documents(supabase):
    """Return the exact row count of the ``documents`` table."""
    result = supabase.table("documents").select("id", count="exact").execute()
    return result.count

11
utils.py Normal file
View File

@@ -0,0 +1,11 @@
import hashlib
def compute_sha1_from_file(file_path):
    """Return the SHA-1 hex digest of the file at *file_path*.

    Reads the file in 64 KiB chunks so large uploads are hashed without
    loading the whole file into memory. (The original also shadowed the
    builtin ``bytes`` with its local variable; that is fixed here.)
    """
    sha1 = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter(callable, sentinel) yields chunks until read() returns b"".
        for chunk in iter(lambda: file.read(65536), b""):
            sha1.update(chunk)
    return sha1.hexdigest()
def compute_sha1_from_content(content):
    """Return the SHA-1 hex digest of *content* (a bytes-like object)."""
    return hashlib.sha1(content).hexdigest()