diff --git a/files.py b/files.py
index ab999fd62..823855f92 100644
--- a/files.py
+++ b/files.py
@@ -9,6 +9,8 @@ from loaders.html import process_html
 from utils import compute_sha1_from_content
 from loaders.pdf import process_pdf
 from loaders.html import get_html, create_html_file, delete_tempfile
+from loaders.powerpoint import process_powerpoint
+from loaders.docx import process_docx
 import requests
 import re
 import unicodedata
@@ -28,6 +30,8 @@ file_processors = {
     ".mpeg": process_audio,
     ".pdf": process_pdf,
     ".html": process_html,
+    ".pptx": process_powerpoint,
+    ".docx": process_docx,
 }
 
 def file_uploader(supabase, openai_key, vector_store):
diff --git a/loaders/docx.py b/loaders/docx.py
new file mode 100644
index 000000000..95b1bb1d5
--- /dev/null
+++ b/loaders/docx.py
@@ -0,0 +1,11 @@
+from .common import process_file
+from langchain.document_loaders import Docx2txtLoader
+
+
+def process_docx(vector_store, file):
+    """Process a ``.docx`` file through the shared ``process_file`` helper.
+
+    Both arguments are forwarded unchanged to ``process_file`` together with
+    ``Docx2txtLoader`` and the ".docx" suffix; its result is returned as-is.
+    """
+    return process_file(vector_store, file, Docx2txtLoader, ".docx")
diff --git a/loaders/powerpoint.py b/loaders/powerpoint.py
new file mode 100644
index 000000000..06157999e
--- /dev/null
+++ b/loaders/powerpoint.py
@@ -0,0 +1,11 @@
+from .common import process_file
+from langchain.document_loaders import UnstructuredPowerPointLoader
+
+
+def process_powerpoint(vector_store, file):
+    """Process a ``.pptx`` file through the shared ``process_file`` helper.
+
+    Both arguments are forwarded unchanged to ``process_file`` together with
+    ``UnstructuredPowerPointLoader`` and the ".pptx" suffix; its result is returned as-is.
+    """
+    return process_file(vector_store, file, UnstructuredPowerPointLoader, ".pptx")