feat(init): init repository

This commit is contained in:
Stan Girard 2023-05-12 23:05:31 +02:00
commit 921d7e2502
12 changed files with 430 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
secondbrain/
.env
.streamlit/
**/*.pyc
toto.txt

37
files.py Normal file
View File

@@ -0,0 +1,37 @@
import streamlit as st
from loaders.audio import process_audio
from loaders.txt import process_txt
from loaders.csv import process_csv
from loaders.markdown import process_markdown
from utils import compute_sha1_from_content
def file_uploader(supabase, openai_key, vector_store):
    """Render the upload widget and ingest each accepted file into the vector store.

    Files already present (matched by content SHA-1) are skipped; unknown
    extensions are reported without being processed.
    """
    accepted_types = ["txt", "csv", "md", "m4a", "mp3", "webm", "mp4", "mpga", "wav", "mpeg"]
    files = st.file_uploader("Upload a file", accept_multiple_files=True, type=accepted_types)
    if not st.button("Add to Database"):
        return
    if files is None:
        return
    for file in files:
        if file_already_exists(supabase, file):
            st.write(f"😎 {file.name} is already in the database.")
            continue
        name = file.name
        if name.endswith(".txt"):
            process_txt(vector_store, file)
        elif name.endswith((".m4a", ".mp3", ".webm", ".mp4", ".mpga", ".wav", ".mpeg")):
            process_audio(openai_key, vector_store, file)
        elif name.endswith(".csv"):
            process_csv(vector_store, file)
        elif name.endswith(".md"):
            process_markdown(vector_store, file)
        else:
            st.write(f"{name} is not a valid file type.")
            continue
        # Shared success message for every processed file.
        st.write(f"{file.name} ")
def file_already_exists(supabase, file):
    """Return True if a document with the same content SHA-1 is already stored.

    The hash is computed over the uploaded file's raw bytes and compared
    against the ``file_sha1`` key stored in each document's JSON metadata.
    """
    file_sha1 = compute_sha1_from_content(file.getvalue())
    response = supabase.table("documents").select("id").eq("metadata->>file_sha1", file_sha1).execute()
    # A non-empty result set means the document exists; return the boolean
    # directly instead of the verbose if/else from the original.
    return len(response.data) > 0

0
loaders/__init__.py Normal file
View File

51
loaders/audio.py Normal file
View File

@@ -0,0 +1,51 @@
import os
import tempfile
from io import BytesIO
import openai
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_content
from langchain.schema import Document
# Create a function to transcribe audio using Whisper
def _transcribe_audio(api_key, audio_file):
    """Translate an uploaded audio file to English text using OpenAI Whisper.

    ``audio_file`` is an uploaded-file object exposing ``.read()`` and ``.name``.
    Returns the Whisper API response (its ``.text`` attribute holds the text).
    """
    openai.api_key = api_key
    # Keep the original extension so the API can detect the audio format.
    file_extension = os.path.splitext(audio_file.name)[-1]
    # The API wants a real file handle; spool the upload to a temp file.
    # delete=True removes it automatically when the context exits.
    # NOTE: the original wrapped audio_file.read() in a redundant BytesIO
    # only to .read() it again — the bytes can be written directly.
    with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
        temp_audio_file.write(audio_file.read())
        temp_audio_file.seek(0)  # rewind so the API reads from the start
        transcript = openai.Audio.translate("whisper-1", temp_audio_file)
    return transcript
def process_audio(openai_api_key, vector_store, file_name):
    """Transcribe an uploaded audio file, chunk the transcript, and index it.

    ``file_name`` is the uploaded-file object (despite the name — TODO confirm
    with callers before renaming). Returns the vector store.
    """
    # (The original initialized file_sha = "" and immediately overwrote it;
    # the dead assignment is removed.)
    transcript = _transcribe_audio(openai_api_key, file_name)
    # Hash the transcript text, not the raw audio, so identical content dedupes.
    file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(transcript.text)
    # Attach the content hash as metadata to every chunk for later dedup lookups.
    docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha})
                          for text in texts]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

34
loaders/csv.py Normal file
View File

@@ -0,0 +1,34 @@
import tempfile
import streamlit as st
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_file
from langchain.schema import Document
def process_csv(vector_store, file):
    """Load an uploaded CSV file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    (The original returned None; callers in files.py ignore the return value,
    so this is backward compatible.)
    """
    # (Dead initializations documents = [] and file_sha = "" removed.)
    # CSVLoader needs a path on disk, so spool the upload to a temp file;
    # delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = CSVLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk (stored via
    # vector_store.add_documents into the default `documents` table).
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in documents]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

36
loaders/markdown.py Normal file
View File

@@ -0,0 +1,36 @@
import tempfile
import streamlit as st
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_file
def process_markdown(vector_store, file):
    """Load an uploaded Markdown file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    (The original returned None; callers in files.py ignore the return value,
    so this is backward compatible.)
    """
    # (Dead initializations documents = [] and file_sha = "" removed.)
    # UnstructuredMarkdownLoader needs a path on disk, so spool the upload
    # to a temp file; delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".md") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = UnstructuredMarkdownLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk for deduplication.
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in documents]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

35
loaders/txt.py Normal file
View File

@@ -0,0 +1,35 @@
import tempfile
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
def process_txt(vector_store, file):
    """Load an uploaded text file, split it into chunks, and index it.

    Returns the vector store, matching ``process_audio`` for consistency.
    """
    # BUG FIX: compute_sha1_from_file is called below but was never imported
    # in loaders/txt.py (unlike loaders/csv.py and loaders/markdown.py), so
    # the original raised NameError at runtime. Imported locally here so this
    # fix is self-contained.
    from utils import compute_sha1_from_file

    # (Dead initializations documents = [] and file_sha = "" removed.)
    # TextLoader needs a path on disk, so spool the upload to a temp file;
    # delete=True removes it when the context exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".txt") as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()
        loader = TextLoader(tmp_file.name)
        documents = loader.load()
        # Hash the on-disk bytes so duplicate uploads can be detected later.
        file_sha1 = compute_sha1_from_file(tmp_file.name)
    # Chunking parameters are configured in the sidebar.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    # Attach the file hash as metadata to every chunk for deduplication.
    docs_with_metadata = [Document(page_content=doc.page_content,
                                   metadata={"file_sha1": file_sha1})
                          for doc in docs]
    vector_store.add_documents(docs_with_metadata)
    return vector_store

35
main.py Normal file
View File

@@ -0,0 +1,35 @@
import os
import tempfile
import streamlit as st
from sidebar import sidebar
from files import file_uploader
from question import chat_with_doc
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase import Client, create_client
# Streamlit entry point: wires together the sidebar, the file uploader, and
# the chat UI against a Supabase-backed vector store.

# Credentials are read from Streamlit secrets (.streamlit/secrets.toml).
supabase_url = st.secrets.supabase_url
supabase_key = st.secrets.supabase_key
openai_api_key = st.secrets.openai_api_key

# Shared clients: one Supabase connection, one embedding model, and the
# vector store over the default `documents` table.
supabase: Client = create_client(supabase_url, supabase_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vector_store = SupabaseVectorStore(supabase, embeddings, table_name="documents")

# Page layout, rendered top to bottom: title, sidebar config, upload
# section, divider, then the chat section.
st.title("🧠 Second Brain 🧠")
st.markdown("Store your knowledge in a vector store and query it with OpenAI's GPT-3/4.")
st.markdown("---\n\n")
sidebar(supabase)
file_uploader(supabase,openai_api_key, vector_store)
st.markdown("---\n\n")
chat_with_doc(openai_api_key, vector_store)

18
question.py Normal file
View File

@@ -0,0 +1,18 @@
import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
# Module-level conversation buffer so chat history survives Streamlit reruns.
# NOTE(review): a module-level object is also shared across all user sessions
# in a multi-user deployment — confirm that is intended.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


def chat_with_doc(openai_api_key,vector_store):
    """Render a question box and answer it via retrieval-augmented chat.

    Builds a ConversationalRetrievalChain over the vector store's retriever
    and writes the chain's answer to the page when "Ask" is clicked.
    """
    question = st.text_input("## Ask a question")
    # Model and temperature come from the sidebar; fall back to defaults
    # when the sidebar hasn't stored them yet.
    temperature = st.session_state.get("temperature", 0.0)
    model = st.session_state.get("model", "gpt-3.5-turbo")
    button = st.button("Ask")
    if button:
        # The chain is rebuilt each click; `memory` carries the running history.
        qa = ConversationalRetrievalChain.from_llm(OpenAI(model_name=model, openai_api_key=openai_api_key, temperature=temperature), vector_store.as_retriever(), memory=memory)
        result = qa({"question": question})
        st.write(result["answer"])

136
requirements.txt Normal file
View File

@@ -0,0 +1,136 @@
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
anyio==3.6.2
argilla==1.7.0
async-timeout==4.0.2
attrs==23.1.0
backoff==2.2.1
bleach==6.0.0
blinker==1.6.2
cachetools==5.3.0
certifi==2023.5.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==3.1.0
click==8.1.3
click-log==0.4.0
colorama==0.4.6
commonmark==0.9.1
cryptography==40.0.2
dataclasses==0.6
dataclasses-json==0.5.7
decorator==5.1.1
Deprecated==1.2.13
deprecation==2.1.0
docutils==0.20
dotty-dict==1.3.1
entrypoints==0.4
et-xmlfile==1.1.0
frozenlist==1.3.3
gitdb==4.0.10
GitPython==3.1.31
gotrue==1.0.1
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==2.10
importlib-metadata==6.6.0
iniconfig==2.0.0
invoke==1.7.3
jaraco.classes==3.2.3
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
keyring==23.13.1
langchain==0.0.166
lxml==4.9.2
Markdown==3.4.3
markdown-it-py==2.2.0
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
mdurl==0.1.2
monotonic==1.6
more-itertools==9.1.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
nltk==3.8.1
numexpr==2.8.4
numpy==1.23.5
olefile==0.46
openai==0.27.6
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.1
pandas==1.5.3
pdfminer.six==20221105
Pillow==9.5.0
pkginfo==1.9.6
pluggy==1.0.0
postgrest==0.10.6
postgrest-py==0.4.0
protobuf==3.20.3
py==1.11.0
pyarrow==12.0.0
pycparser==2.21
pydantic==1.10.7
pydeck==0.8.1b0
Pygments==2.15.1
Pympler==1.0.1
pypandoc==1.11
pypdf==3.8.1
pyrsistent==0.19.3
pytest==6.2.5
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-gitlab==3.14.0
python-magic==0.4.27
python-pptx==0.6.21
python-semantic-release==7.33.2
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
readme-renderer==37.3
realtime==1.0.0
realtime-py==0.1.3
regex==2023.5.5
requests==2.30.0
requests-toolbelt==1.0.0
rfc3986==1.5.0
rich==13.0.1
semver==2.13.0
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
SQLAlchemy==2.0.13
storage3==0.5.2
streamlit==1.22.0
StrEnum==0.4.10
supabase==1.0.3
supabase-py==0.0.2
supafunc==0.2.2
tenacity==8.2.2
tiktoken==0.4.0
toml==0.10.2
tomlkit==0.11.8
toolz==0.12.0
tornado==6.3.1
tqdm==4.65.0
twine==3.8.0
typer==0.9.0
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
tzlocal==4.3
unstructured==0.6.5
urllib3==1.26.15
validators==0.20.0
webencodings==0.5.1
websockets==10.4
wrapt==1.14.1
XlsxWriter==3.1.0
yarl==1.9.2
zipp==3.15.0

32
sidebar.py Normal file
View File

@@ -0,0 +1,32 @@
import streamlit as st
def sidebar(supabase):
    """Draw the configuration sidebar and persist every choice in session state."""
    st.sidebar.title("Configuration")
    # Show how many documents are currently stored in the database.
    doc_count = number_of_documents(supabase)
    st.sidebar.markdown(f"**Docs in DB:** {doc_count}")
    # Model and generation settings, stored directly in session state.
    st.session_state.model = st.sidebar.selectbox(
        "Select Model", ["gpt-3.5-turbo", "gpt-4"])
    st.session_state.temperature = st.sidebar.slider(
        "Select Temperature", 0.0, 1.0, 0.0, 0.1)
    # Chunking settings consumed by the document loaders.
    st.session_state.chunk_size = st.sidebar.slider(
        "Select Chunk Size", 100, 1000, 500, 50)
    st.session_state.chunk_overlap = st.sidebar.slider(
        "Select Chunk Overlap", 0, 100, 0, 10)
def number_of_documents(supabase):
    """Return the exact row count of the ``documents`` table."""
    result = supabase.table("documents").select("id", count="exact").execute()
    return result.count

11
utils.py Normal file
View File

@@ -0,0 +1,11 @@
import hashlib
def compute_sha1_from_file(file_path):
    """Return the SHA-1 hex digest of the file at *file_path*.

    Reads the file in 64 KiB chunks so large uploads are hashed without
    loading the whole file into memory. (The original also shadowed the
    builtin ``bytes`` with its local variable; that is fixed here.)
    """
    sha1 = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter(callable, sentinel) yields chunks until read() returns b"".
        for chunk in iter(lambda: file.read(65536), b""):
            sha1.update(chunk)
    return sha1.hexdigest()
def compute_sha1_from_content(content):
    """Return the SHA-1 hex digest of *content* (a bytes-like object)."""
    return hashlib.sha1(content).hexdigest()