feat: 🎸 ocr (#2187)

Added OCR support for document handling (Tesseract and `unstructured[all-docs]`).

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
This commit is contained in:
Stan Girard 2024-02-12 19:56:20 -08:00 committed by GitHub
parent d4b40b3b42
commit 2ba3bc1f07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 1534 additions and 549 deletions

View File

@ -4,8 +4,8 @@
{
"name": "quivr-chat",
"image": "253053805092.dkr.ecr.eu-west-3.amazonaws.com/quivr:600ff1ede02741c66853cc3e4e7f5001aaba3bc2",
"cpu": "1024",
"memory": "2048",
"cpu": "4096",
"memory": "8192",
"essential": true,
"command": ["celery", "-A", "celery_worker", "worker", "-l", "info"],
"environment": [],
@ -70,8 +70,8 @@
"placementConstraints": [],
"compatibilities": ["EC2", "FARGATE"],
"requiresCompatibilities": ["FARGATE"],
"cpu": "1024",
"memory": "2048",
"cpu": "4096",
"memory": "8192",
"runtimePlatform": {
"cpuArchitecture": "X86_64",
"operatingSystemFamily": "LINUX"

View File

@ -4,9 +4,9 @@
{
"name": "quivr-chat",
"image": "253053805092.dkr.ecr.eu-west-3.amazonaws.com/quivr:35bd4727c67790d295a474dd81dfbef8469365e8",
"cpu": 2048,
"memory": 4096,
"memoryReservation": 4096,
"cpu": "4096",
"memory": "8192",
"memoryReservation": 8192,
"portMappings": [],
"essential": true,
"command": ["celery", "-A", "celery_worker", "worker", "-l", "info"],
@ -76,8 +76,8 @@
"placementConstraints": [],
"compatibilities": ["EC2", "FARGATE"],
"requiresCompatibilities": ["FARGATE"],
"cpu": "2048",
"memory": "4096",
"cpu": "4096",
"memory": "8192",
"runtimePlatform": {
"cpuArchitecture": "X86_64",
"operatingSystemFamily": "LINUX"

View File

@ -13,7 +13,6 @@ nest-asyncio = "==1.5.6"
pypdf = "==3.9.0"
supabase = "==1.1.0"
tiktoken = "==0.4.0"
unstructured = "==0.6.7"
fastapi = "==0.95.2"
python-multipart = "==0.0.6"
uvicorn = "==0.22.0"
@ -46,6 +45,7 @@ pytest = "*"
ddtrace = "*"
watchdog = "*"
langchain-community = "*"
unstructured = {extras = ["all-docs"], version = "*"}
[dev-packages]
black = "*"

1957
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -20,7 +20,13 @@ RUN apt-get clean && apt-get update && apt-get install -y \
build-essential \
libtool \
python-dev \
build-essential && \
build-essential \
# Additional dependencies for document handling
libmagic-dev \
poppler-utils \
tesseract-ocr \
libreoffice \
pandoc && \
rm -rf /var/lib/apt/lists/* && apt-get clean
# Add Rust binaries to the PATH

View File

@ -17,7 +17,13 @@ RUN apt-get clean && apt-get update && apt-get install -y \
automake \
libtool \
python-dev \
build-essential && \
build-essential \
# Additional dependencies for document handling
libmagic-dev \
poppler-utils \
tesseract-ocr \
libreoffice \
pandoc && \
rm -rf /var/lib/apt/lists/* && apt-get clean
# Add Rust binaries to the PATH

View File

@ -2,8 +2,8 @@
aiohttp==3.9.3; python_version >= '3.8'
aiosignal==1.3.1; python_version >= '3.7'
amqp==5.2.0; python_version >= '3.6'
antlr4-python3-runtime==4.9.3
anyio==3.7.1; python_version >= '3.7'
argilla==1.23.0; python_version < '3.12' and python_version >= '3.8'
async-generator==1.10; python_version >= '3.5'
async-timeout==4.0.3; python_full_version <= '3.11.2'
asyncpg==0.27.0; python_full_version >= '3.7.0'
@ -11,7 +11,7 @@ attrs==23.2.0; python_version >= '3.7'
backoff==2.2.1; python_version >= '3.7' and python_version < '4.0'
beautifulsoup4==4.12.3; python_full_version >= '3.6.0'
billiard==4.2.0; python_version >= '3.7'
black==24.1.1; python_version >= '3.8'
black==24.2.0; python_version >= '3.8'
boto3==1.33.7; python_version >= '3.7'
botocore==1.33.7; python_version >= '3.7'
bytecode==0.15.1; python_version >= '3.8'
@ -19,30 +19,40 @@ cattrs==23.2.3; python_version >= '3.8'
celery[sqs]==5.3.6; python_version >= '3.8'
certifi==2024.2.2; python_version >= '3.6'
cffi==1.16.0; platform_python_implementation != 'PyPy'
chardet==5.2.0; python_version >= '3.7'
charset-normalizer==3.3.2; python_full_version >= '3.7.0'
click==8.1.7; python_version >= '3.7'
click-didyoumean==0.3.0; python_full_version >= '3.6.2' and python_full_version < '4.0.0'
click-plugins==1.1.1
click-repl==0.3.0; python_version >= '3.6'
coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
contourpy==1.2.0; python_version >= '3.9'
cryptography==42.0.2; python_version >= '3.7'
cssselect==1.2.0; python_version >= '3.7'
cycler==0.12.1; python_version >= '3.8'
dataclasses-json==0.6.4; python_version >= '3.7' and python_version < '4.0'
dataclasses-json-speakeasy==0.5.11; python_version >= '3.7' and python_version < '4.0'
ddsketch==2.0.4; python_version >= '2.7'
ddtrace==2.5.2; python_version >= '3.7'
ddtrace==2.6.0; python_version >= '3.7'
deprecated==1.2.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
deprecation==2.1.0
distro==1.9.0; python_version >= '3.6'
docx2txt==0.8
ecdsa==0.18.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
effdet==0.4.1
emoji==2.10.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
envier==0.5.1; python_version >= '3.7'
et-xmlfile==1.1.0; python_version >= '3.6'
fastapi==0.95.2; python_version >= '3.7'
feedfinder2==0.0.4
feedparser==6.0.11; python_version >= '3.6'
filelock==3.13.1; python_version >= '3.8'
filetype==1.2.0
flake8==6.0.0; python_full_version >= '3.8.1'
flake8-black==0.3.6; python_version >= '3.7'
flatbuffers==23.5.26
flower==2.0.1; python_version >= '3.7'
fonttools==4.48.1; python_version >= '3.8'
frozenlist==1.4.1; python_version >= '3.8'
fsspec==2024.2.0; python_version >= '3.8'
gitdb==4.0.11; python_version >= '3.7'
@ -53,65 +63,82 @@ html5lib==1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2,
httpcore==0.17.3; python_version >= '3.7'
httpx==0.24.1; python_version >= '3.7'
huggingface-hub==0.20.3; python_full_version >= '3.8.0'
humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
humanize==4.9.0; python_version >= '3.8'
idna==3.6; python_version >= '3.5'
importlib-metadata==6.11.0; python_version >= '3.8'
iniconfig==2.0.0; python_version >= '3.7'
iopath==0.1.10; python_version >= '3.6'
jieba3k==0.35.1
jinja2==3.1.3; python_version >= '3.7'
jmespath==1.0.1; python_version >= '3.7'
joblib==1.3.2; python_version >= '3.7'
jq==1.6.0; python_version >= '3.5'
jsonpatch==1.33; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
jsonpath-python==1.0.6; python_version >= '3.6'
jsonpointer==2.4; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
kiwisolver==1.4.5; python_version >= '3.7'
kombu[sqs]==5.3.5; python_version >= '3.8'
langchain==0.1.5; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-community==0.0.17; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-core==0.1.18; python_version < '4.0' and python_full_version >= '3.8.1'
langsmith==0.0.86; python_version < '4.0' and python_full_version >= '3.8.1'
litellm==1.22.5; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
langchain==0.1.6; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-community==0.0.19; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-core==0.1.22; python_version < '4.0' and python_full_version >= '3.8.1'
langdetect==1.0.9
langsmith==0.0.87; python_version < '4.0' and python_full_version >= '3.8.1'
layoutparser[layoutmodels,tesseract]==0.3.4; python_version >= '3.6'
litellm==1.23.10; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
lxml==5.1.0; python_version >= '3.6'
markdown==3.5.2; python_version >= '3.8'
markdown-it-py==3.0.0; python_version >= '3.8'
markupsafe==2.1.5; python_version >= '3.7'
marshmallow==3.20.2; python_version >= '3.8'
matplotlib==3.8.2; python_version >= '3.9'
mccabe==0.7.0; python_version >= '3.6'
mdurl==0.1.2; python_version >= '3.7'
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0; python_version >= '3.4'
multidict==6.0.5; python_version >= '3.7'
mypy-extensions==1.0.0; python_version >= '3.5'
nest-asyncio==1.5.6; python_version >= '3.5'
networkx==3.2.1
newspaper3k==0.2.8
nltk==3.8.1; python_version >= '3.7'
nodeenv==1.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
numpy==1.23.5; python_version >= '3.8'
numpy==1.26.4; python_version >= '3.9'
olefile==0.47; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
omegaconf==2.3.0; python_version >= '3.6'
onnx==1.15.0
onnxruntime==1.15.1
openai==1.1.1; python_full_version >= '3.7.1'
opencv-python==4.9.0.80; python_version >= '3.6'
openpyxl==3.1.2; python_version >= '3.6'
opentelemetry-api==1.22.0; python_version >= '3.7'
packaging==23.2; python_version >= '3.7'
pandas==1.5.3; python_version >= '3.8'
pandas==2.2.0; python_version >= '3.8'
pathspec==0.12.1; python_version >= '3.8'
pdf2image==1.16.3
pdfminer.six==20231228; python_version >= '3.6'
pdfminer.six==20221105; python_version >= '3.6'
pdfplumber==0.10.4; python_version >= '3.8'
pikepdf==8.12.0
pillow==10.2.0; python_version >= '3.8'
pillow-heif==0.15.0
platformdirs==4.2.0; python_version >= '3.8'
pluggy==1.4.0; python_version >= '3.8'
portalocker==2.8.2; python_version >= '3.8'
postgrest==0.11.0; python_version >= '3.8' and python_version < '4.0'
posthog==3.1.0
prometheus-client==0.19.0; python_version >= '3.8'
prompt-toolkit==3.0.43; python_full_version >= '3.7.0'
protobuf==4.25.2; python_version >= '3.8'
pyasn1==0.5.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pycocotools==2.0.7; python_version >= '3.5'
pycodestyle==2.10.0; python_version >= '3.6'
pycparser==2.21
pycurl==7.45.2
pydantic==1.10.14; python_version >= '3.7'
pyflakes==3.0.1; python_version >= '3.6'
pygments==2.17.2; python_version >= '3.7'
pypandoc==1.11; python_version >= '3.6'
pyparsing==3.1.1; python_full_version >= '3.6.8'
pypdf==3.9.0; python_version >= '3.6'
pypdfium2==4.27.0; python_version >= '3.6'
pyright==1.1.316; python_version >= '3.7'
pytesseract==0.3.10; python_version >= '3.7'
pytest==8.0.0; python_version >= '3.8'
@ -120,54 +147,65 @@ pytest-mock==3.12.0; python_version >= '3.8'
python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-docx==1.1.0; python_version >= '3.7'
python-dotenv==1.0.1; python_version >= '3.8'
python-iso639==2024.2.7; python_version >= '3.8'
python-jose==3.3.0
python-magic==0.4.27; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
python-multipart==0.0.6; python_version >= '3.7'
python-pptx==0.6.23
pytz==2024.1
pyyaml==6.0.1; python_version >= '3.6'
rapidfuzz==3.6.1; python_version >= '3.8'
realtime==1.0.2; python_version >= '3.8' and python_version < '4.0'
redis==4.5.4; python_version >= '3.7'
regex==2023.12.25; python_version >= '3.7'
requests==2.31.0; python_version >= '3.7'
requests-file==2.0.0
resend==0.5.1; python_version >= '3.7'
rich==13.7.0; python_full_version >= '3.7.0'
rsa==4.9; python_version >= '3.6' and python_version < '4'
s3transfer==0.8.2; python_version >= '3.7'
sentry-sdk[fastapi]==1.40.0
setuptools==69.0.3; python_version >= '3.8'
safetensors==0.4.2; python_version >= '3.7'
scipy==1.12.0; python_version >= '3.9'
sentry-sdk[fastapi]==1.40.3
setuptools==69.1.0; python_version >= '3.8'
sgmllib3k==1.0.0
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
smmap==5.0.1; python_version >= '3.7'
sniffio==1.3.0; python_version >= '3.7'
soupsieve==2.5; python_version >= '3.8'
sqlalchemy==2.0.25; python_version >= '3.7'
sqlalchemy==2.0.26; python_version >= '3.7'
starlette==0.27.0; python_version >= '3.7'
storage3==0.6.1; python_version >= '3.8' and python_version < '4.0'
strenum==0.4.15
supabase==1.1.0; python_version >= '3.8' and python_version < '4.0'
supafunc==0.2.3; python_version >= '3.8' and python_version < '4.0'
sympy==1.12; python_version >= '3.8'
tabulate==0.9.0; python_version >= '3.7'
tenacity==8.2.3; python_version >= '3.7'
tiktoken==0.4.0; python_version >= '3.8'
timm==0.9.12; python_version >= '3.7'
tinysegmenter==0.3
tldextract==5.1.1; python_version >= '3.8'
tokenizers==0.15.1; python_version >= '3.7'
tokenizers==0.15.2; python_version >= '3.7'
torch==2.2.0
torchvision==0.17.0
tornado==6.4; python_version >= '3.8'
tqdm==4.66.1; python_version >= '3.7'
typer==0.9.0; python_version >= '3.6'
tqdm==4.66.2; python_version >= '3.7'
transformers==4.37.2; python_full_version >= '3.8.0'
typing-extensions==4.9.0; python_version >= '3.8'
typing-inspect==0.9.0
tzdata==2023.4; python_version >= '2'
unstructured==0.6.7; python_full_version >= '3.7.0'
tzdata==2024.1; python_version >= '2'
unstructured[all-docs]==0.12.4; python_version < '3.12' and python_full_version >= '3.9.0'
unstructured-client==0.18.0; python_version >= '3.8'
unstructured-inference==0.7.23
unstructured.pytesseract==0.3.12
urllib3==2.0.7; python_version >= '3.10'
uvicorn==0.22.0; python_version >= '3.7'
vine==5.1.0; python_version >= '3.6'
watchdog==3.0.0; python_version >= '3.7'
watchdog==4.0.0; python_version >= '3.8'
wcwidth==0.2.13
webencodings==0.5.1
websockets==11.0.3; python_version >= '3.7'
wrapt==1.14.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
wrapt==1.16.0; python_version >= '3.6'
xlrd==1.0.0
xlsxwriter==3.1.9; python_version >= '3.6'
xmltodict==0.13.0; python_version >= '3.4'

View File

@ -71,7 +71,7 @@ site_url = "http://localhost:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://localhost:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
jwt_expiry = 604800
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.