feat(assistant): cdp (#3305)

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: Zewed <dewez.antoine2@gmail.com>
Stan Girard 2024-10-03 15:46:59 +02:00 committed by GitHub
parent c39913956d
commit b767f19f28
110 changed files with 6315 additions and 706 deletions

View File

@ -24,6 +24,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \
libreoffice \
libpq-dev \
gcc \
libhdf5-serial-dev \
pandoc && \
rm -rf /var/lib/apt/lists/* && apt-get clean
@ -46,6 +47,8 @@ COPY core/pyproject.toml core/README.md ./core/
COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py
COPY worker/pyproject.toml worker/README.md ./worker/
COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py
COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/
COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py
COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/
COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py

View File

@ -23,6 +23,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \
libreoffice \
libpq-dev \
gcc \
libhdf5-serial-dev \
pandoc && \
rm -rf /var/lib/apt/lists/* && apt-get clean
@ -33,6 +34,8 @@ COPY core/pyproject.toml core/README.md ./core/
COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py
COPY worker/pyproject.toml worker/README.md ./worker/
COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py
COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/
COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py
COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/
COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py

View File

@ -2,7 +2,7 @@ import io
from typing import Annotated, List
from uuid import uuid4
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
from fastapi import APIRouter, Depends, File, HTTPException, Request, UploadFile
from quivr_api.celery_config import celery
from quivr_api.logger import get_logger
@ -16,6 +16,7 @@ from quivr_api.modules.assistant.dto.outputs import AssistantOutput
from quivr_api.modules.assistant.entity.assistant_entity import (
AssistantSettings,
)
from quivr_api.modules.assistant.entity.task_entity import TaskMetadata
from quivr_api.modules.assistant.services.tasks_service import TasksService
from quivr_api.modules.dependencies import get_service
from quivr_api.modules.upload.service.upload_file import (
@ -64,12 +65,15 @@ async def create_task(
current_user: UserIdentityDep,
tasks_service: TasksServiceDep,
request: Request,
input: InputAssistant,
input: str = File(...),
files: List[UploadFile] = None,
):
input = InputAssistant.model_validate_json(input)
assistant = next(
(assistant for assistant in assistants if assistant.id == input.id), None
)
if assistant is None:
raise HTTPException(status_code=404, detail="Assistant not found")
@ -80,7 +84,7 @@ async def create_task(
raise HTTPException(status_code=400, detail=error)
else:
print("Assistant input is valid.")
notification_uuid = uuid4()
notification_uuid = f"{assistant.name}-{str(uuid4())[:8]}"
# Process files dynamically
for upload_file in files:
@ -96,8 +100,14 @@ async def create_task(
task = CreateTask(
assistant_id=input.id,
pretty_id=str(notification_uuid),
assistant_name=assistant.name,
pretty_id=notification_uuid,
settings=input.model_dump(mode="json"),
task_metadata=TaskMetadata(
input_files=[file.filename for file in files]
).model_dump(mode="json")
if files
else None, # type: ignore
)
task_created = await tasks_service.create_task(task, current_user.id)

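For context, the `create_task` endpoint above now receives the `InputAssistant` payload as a JSON string in a multipart form field (`input: str = File(...)`) alongside the uploaded files, rather than as a JSON body. A minimal client sketch follows; the route path, host/port, auth header, and the payload layout are illustrative assumptions, not taken from this diff:

```python
import json
import httpx

# Illustrative payload; only the assistant id/name come from the definitions
# in this commit, the rest of the schema is assumed.
payload = {"id": 2, "name": "Consistency Check"}

with httpx.Client(base_url="http://localhost:5050") as client:  # host/port assumed
    response = client.post(
        "/assistant/task",  # route path assumed
        data={"input": json.dumps(payload)},  # parsed server-side via model_validate_json
        files=[
            ("files", ("Document 1.pdf", open("doc1.pdf", "rb"))),
            ("files", ("Document 2.pdf", open("doc2.pdf", "rb"))),
        ],
        headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
    )
    response.raise_for_status()
```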
View File

@ -1,8 +1,11 @@
from quivr_api.modules.assistant.dto.inputs import InputAssistant
from quivr_api.modules.assistant.dto.outputs import (
AssistantOutput,
ConditionalInput,
InputBoolean,
InputFile,
Inputs,
InputSelectText,
Pricing,
)
@ -166,10 +169,10 @@ def validate_assistant_input(
assistant1 = AssistantOutput(
id=1,
name="Assistant 1",
description="Assistant 1 description",
name="Compliance Check",
description="Allows analyzing the compliance of the information contained in documents against charter or regulatory requirements.",
pricing=Pricing(),
tags=["tag1", "tag2"],
tags=["Disabled"],
input_description="Input description",
output_description="Output description",
inputs=Inputs(
@ -183,19 +186,66 @@ assistant1 = AssistantOutput(
assistant2 = AssistantOutput(
id=2,
name="Assistant 2",
description="Assistant 2 description",
name="Consistency Check",
description="Ensures that the information in one document is replicated identically in another document.",
pricing=Pricing(),
tags=["tag1", "tag2"],
tags=[],
input_description="Input description",
output_description="Output description",
icon_url="https://example.com/icon.png",
inputs=Inputs(
files=[
InputFile(key="file_1", description="File description"),
InputFile(key="file_2", description="File description"),
InputFile(key="Document 1", description="File description"),
InputFile(key="Document 2", description="File description"),
],
select_texts=[
InputSelectText(
key="DocumentsType",
description="Select Documents Type",
options=[
"Etiquettes VS Cahier des charges",
"Fiche Dev VS Cahier des charges",
],
),
],
),
)
assistants = [assistant1, assistant2]
assistant3 = AssistantOutput(
id=3,
name="Difference Detection",
description="Highlights differences between one document and another after modifications.",
pricing=Pricing(),
tags=[],
input_description="Input description",
output_description="Output description",
icon_url="https://example.com/icon.png",
inputs=Inputs(
files=[
InputFile(key="Document 1", description="File description"),
InputFile(key="Document 2", description="File description"),
],
booleans=[
InputBoolean(
key="Hard-to-Read Document?", description="Boolean description"
),
],
select_texts=[
InputSelectText(
key="DocumentsType",
description="Select Documents Type",
options=["Etiquettes", "Cahier des charges"],
),
],
conditional_inputs=[
ConditionalInput(
key="DocumentsType",
conditional_key="Hard-to-Read Document?",
condition="equals",
value="Etiquettes",
),
],
),
)
assistants = [assistant1, assistant2, assistant3]

View File

@ -1,4 +1,4 @@
from typing import List, Optional
from typing import Dict, List, Optional
from uuid import UUID
from pydantic import BaseModel, root_validator
@ -7,7 +7,9 @@ from pydantic import BaseModel, root_validator
class CreateTask(BaseModel):
pretty_id: str
assistant_id: int
assistant_name: str
settings: dict
task_metadata: Dict | None = None
class BrainInput(BaseModel):

View File

@ -61,6 +61,21 @@ class InputSelectNumber(BaseModel):
default: Optional[int] = None
class ConditionalInput(BaseModel):
"""
A conditional input shows or hides one input based on the value of another input.
key: The key of the input whose visibility is conditional.
conditional_key: The key of the input whose value determines whether it is shown.
"""
key: str
conditional_key: str
condition: Optional[str] = (
None # e.g. "equals", "contains", "starts_with", "ends_with", "regex", "in", "not_in", "is_empty", "is_not_empty"
)
value: Optional[str] = None
class Inputs(BaseModel):
files: Optional[List[InputFile]] = None
urls: Optional[List[InputUrl]] = None
@ -70,6 +85,7 @@ class Inputs(BaseModel):
select_texts: Optional[List[InputSelectText]] = None
select_numbers: Optional[List[InputSelectNumber]] = None
brain: Optional[BrainInput] = None
conditional_inputs: Optional[List[ConditionalInput]] = None
class Pricing(BaseModel):

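As a reading aid, here is a minimal sketch of how a `ConditionalInput` could be evaluated to decide whether its target input is shown. The `should_show` helper is hypothetical and not part of this commit; only the field names and the example values come from the models and assistant definitions above:

```python
def should_show(conditional: ConditionalInput, values: dict[str, str]) -> bool:
    """Hypothetical helper: evaluate a ConditionalInput against current input values."""
    controlling = values.get(conditional.conditional_key, "")
    match conditional.condition:
        case "equals":
            return controlling == conditional.value
        case "contains":
            return bool(conditional.value) and conditional.value in controlling
        case "is_empty":
            return not controlling
        case "is_not_empty":
            return bool(controlling)
        case _:
            return True  # unknown or missing condition: show the input by default

# Example mirroring assistant3: show "DocumentsType" only when the
# "Hard-to-Read Document?" input equals "Etiquettes".
cond = ConditionalInput(
    key="DocumentsType",
    conditional_key="Hard-to-Read Document?",
    condition="equals",
    value="Etiquettes",
)
should_show(cond, {"Hard-to-Read Document?": "Etiquettes"})  # True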
View File

@ -1,10 +1,15 @@
from datetime import datetime
from typing import Dict
from typing import Dict, List, Optional
from uuid import UUID
from pydantic import BaseModel
from sqlmodel import JSON, TIMESTAMP, BigInteger, Column, Field, SQLModel, text
class TaskMetadata(BaseModel):
input_files: Optional[List[str]] = None
class Task(SQLModel, table=True):
__tablename__ = "tasks" # type: ignore
@ -17,6 +22,7 @@ class Task(SQLModel, table=True):
),
)
assistant_id: int
assistant_name: str
pretty_id: str
user_id: UUID
status: str = Field(default="pending")
@ -29,6 +35,4 @@ class Task(SQLModel, table=True):
)
settings: Dict = Field(default_factory=dict, sa_column=Column(JSON))
answer: str | None = Field(default=None)
class Config:
arbitrary_types_allowed = True
task_metadata: Dict | None = Field(default_factory=dict, sa_column=Column(JSON))

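To tie the two new columns together, a hypothetical sketch of how `TaskMetadata` ends up in the `tasks.task_metadata` JSON column; the field values are made up for illustration:

```python
from uuid import uuid4

metadata = TaskMetadata(input_files=["spec.pdf", "label.pdf"])
task = Task(
    assistant_id=3,
    assistant_name="Difference Detection",           # new column
    pretty_id="Difference Detection-1a2b3c4d",
    user_id=uuid4(),
    settings={},
    task_metadata=metadata.model_dump(mode="json"),  # new JSON column
)
```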
View File

@ -3,7 +3,7 @@ from uuid import UUID
from sqlalchemy import exc
from sqlalchemy.ext.asyncio import AsyncSession
from sqlmodel import select
from sqlmodel import col, select
from quivr_api.modules.assistant.dto.inputs import CreateTask
from quivr_api.modules.assistant.entity.task_entity import Task
@ -21,9 +21,11 @@ class TasksRepository(BaseRepository):
try:
task_to_create = Task(
assistant_id=task.assistant_id,
assistant_name=task.assistant_name,
pretty_id=task.pretty_id,
user_id=user_id,
settings=task.settings,
task_metadata=task.task_metadata, # type: ignore
)
self.session.add(task_to_create)
await self.session.commit()
@ -40,7 +42,9 @@ class TasksRepository(BaseRepository):
return response.one()
async def get_tasks_by_user_id(self, user_id: UUID) -> Sequence[Task]:
query = select(Task).where(Task.user_id == user_id)
query = (
select(Task).where(Task.user_id == user_id).order_by(col(Task.id).desc())
)
response = await self.session.exec(query)
return response.all()

View File

@ -1,4 +1,4 @@
from typing import Optional, Tuple, Dict
from typing import Dict, Optional, Tuple
from uuid import UUID
from fastapi import HTTPException

View File

@ -1,5 +1,5 @@
import time
import os
import time
from enum import Enum
from fastapi import HTTPException

View File

@ -1,9 +1,10 @@
import os
from typing import Annotated, List, Optional
from uuid import UUID
import os
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
from fastapi.responses import StreamingResponse
from quivr_core.config import RetrievalConfig
from quivr_api.logger import get_logger
from quivr_api.middlewares.auth import AuthBearer, get_current_user
@ -36,7 +37,6 @@ from quivr_api.modules.user.entity.user_identity import UserIdentity
from quivr_api.modules.vector.service.vector_service import VectorService
from quivr_api.utils.telemetry import maybe_send_telemetry
from quivr_api.utils.uuid_generator import generate_uuid_from_string
from quivr_core.config import RetrievalConfig
logger = get_logger(__name__)

View File

@ -2,8 +2,8 @@ from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from uuid import UUID
from pydantic import BaseModel
from pydantic import BaseModel
from quivr_core.models import KnowledgeStatus
from sqlalchemy import JSON, TIMESTAMP, Column, text
from sqlalchemy.ext.asyncio import AsyncAttrs

View File

@ -86,4 +86,3 @@ class SupabaseS3Storage(StorageInterface):
except Exception as e:
logger.error(e)
raise e

View File

@ -527,7 +527,9 @@ async def test_should_process_knowledge_prev_error(
assert new.file_sha1
@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access local variable 'response'")
@pytest.mark.skip(
reason="Bug: UnboundLocalError: cannot access local variable 'response'"
)
@pytest.mark.asyncio(loop_scope="session")
async def test_get_knowledge_storage_path(session: AsyncSession, test_data: TestData):
_, [knowledge, _] = test_data

View File

@ -1,9 +1,8 @@
from fastapi import APIRouter, Depends, HTTPException
from quivr_api.logger import get_logger
from quivr_api.modules.dependencies import get_async_session
from sqlmodel.ext.asyncio.session import AsyncSession
from sqlmodel import text
from sqlmodel.ext.asyncio.session import AsyncSession
logger = get_logger(__name__)
@ -20,7 +19,6 @@ async def root():
@misc_router.get("/healthz", tags=["Health"])
async def healthz(session: AsyncSession = Depends(get_async_session)):
try:
result = await session.execute(text("SELECT 1"))
if not result:

View File

@ -2,7 +2,6 @@ import datetime
import os
from uuid import UUID, uuid4
from quivr_api.utils.uuid_generator import generate_uuid_from_string
from quivr_core.brain import Brain as BrainCore
from quivr_core.chat import ChatHistory as ChatHistoryCore
from quivr_core.config import LLMEndpointConfig, RetrievalConfig
@ -29,6 +28,7 @@ from quivr_api.modules.prompt.entity.prompt import Prompt
from quivr_api.modules.prompt.service.prompt_service import PromptService
from quivr_api.modules.user.entity.user_identity import UserIdentity
from quivr_api.modules.vector.service.vector_service import VectorService
from quivr_api.utils.uuid_generator import generate_uuid_from_string
from quivr_api.vectorstore.supabase import CustomSupabaseVectorStore
from .utils import generate_source

View File

@ -68,7 +68,7 @@ async def generate_source(
try:
file_name = doc.metadata["file_name"]
file_path = await knowledge_service.get_knowledge_storage_path(
file_name=file_name, brain_id=brain_id
file_name=file_name, brain_id=brain_id
)
if file_path in generated_urls:
source_url = generated_urls[file_path]

View File

@ -93,9 +93,7 @@ class SyncUserRepository:
sync_user_id,
)
query = (
self.db.from_("syncs_user")
.select("*")
.eq("user_id", user_id)
self.db.from_("syncs_user").select("*").eq("user_id", user_id)
# .neq("status", "REMOVED")
)
if sync_user_id:
@ -170,9 +168,9 @@ class SyncUserRepository:
)
state_str = json.dumps(state)
self.db.from_("syncs_user").update(sync_user_input.model_dump(exclude_unset=True)).eq(
"user_id", str(sync_user_id)
).eq("state", state_str).execute()
self.db.from_("syncs_user").update(
sync_user_input.model_dump(exclude_unset=True)
).eq("user_id", str(sync_user_id)).eq("state", state_str).execute()
logger.info("Sync user updated successfully")
def update_sync_user_status(self, sync_user_id: int, status: str):

View File

@ -1,9 +1,9 @@
import time
from datetime import datetime, timezone
from typing import List, Sequence
from uuid import UUID
from notion_client import Client
import time
from quivr_api.logger import get_logger
from quivr_api.modules.dependencies import BaseService
@ -165,7 +165,6 @@ async def store_notion_pages(
def fetch_notion_pages(
notion_client: Client, start_cursor: str | None = None, iteration: int = 0
) -> NotionSearchResult:
if iteration > 10:
return NotionSearchResult(results=[], has_more=False, next_cursor=None)
search_result = notion_client.search(
@ -177,7 +176,9 @@ def fetch_notion_pages(
if "code" in search_result and search_result["code"] == "rate_limited":
# Wait 10 seconds
time.sleep(10)
search_result = fetch_notion_pages(notion_client, start_cursor=start_cursor, iteration=iteration+1)
search_result = fetch_notion_pages(
notion_client, start_cursor=start_cursor, iteration=iteration + 1
)
return NotionSearchResult.model_validate(search_result)

View File

@ -74,7 +74,9 @@ def test_fetch_limit_notion_pages_now(fetch_response):
assert len(result) == 0
@pytest.mark.skip(reason="Bug: httpx.ConnectError: [Errno -2] Name or service not known'")
@pytest.mark.skip(
reason="Bug: httpx.ConnectError: [Errno -2] Name or service not known'"
)
@pytest.mark.asyncio(loop_scope="session")
async def test_store_notion_pages_success(
session: AsyncSession,

View File

@ -271,7 +271,10 @@ async def test_process_sync_file_not_supported(syncutils: SyncUtils):
sync_active=sync_active,
)
@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access local variable 'response'")
@pytest.mark.skip(
reason="Bug: UnboundLocalError: cannot access local variable 'response'"
)
@pytest.mark.asyncio(loop_scope="session")
async def test_process_sync_file_noprev(
monkeypatch,
@ -327,8 +330,8 @@ async def test_process_sync_file_noprev(
assert created_km.file_sha1 is None
assert created_km.created_at is not None
assert created_km.metadata == {"sync_file_id": "1"}
assert len(created_km.brains)> 0
assert created_km.brains[0]["brain_id"]== brain_1.brain_id
assert len(created_km.brains) > 0
assert created_km.brains[0]["brain_id"] == brain_1.brain_id
# Assert the celery task is correct
assert task["args"] == ("process_file_task",)
@ -345,8 +348,9 @@ async def test_process_sync_file_noprev(
)
@pytest.mark.skip(reason="Bug: UnboundLocalError: cannot access local variable 'response'")
@pytest.mark.skip(
reason="Bug: UnboundLocalError: cannot access local variable 'response'"
)
@pytest.mark.asyncio(loop_scope="session")
async def test_process_sync_file_with_prev(
monkeypatch,
@ -424,7 +428,7 @@ async def test_process_sync_file_with_prev(
assert created_km.created_at
assert created_km.updated_at == created_km.created_at # new line
assert created_km.metadata == {"sync_file_id": str(dbfiles[0].id)}
assert created_km.brains[0]["brain_id"]== brain_1.brain_id
assert created_km.brains[0]["brain_id"] == brain_1.brain_id
# Check file content changed
assert check_file_exists(str(brain_1.brain_id), sync_file.name)

View File

@ -1,13 +1,14 @@
import asyncio
import base64
import re
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import List
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
import base64
from pdf2image import convert_from_path
import asyncio
import re
# BASE_OCR_PROMPT = """
# Transcribe the content of this file into markdown. Be mindful of the formatting.

View File

@ -1,9 +1,11 @@
from docx.document import Document as DocumentObject
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.section import Section
from docx.section import _Footer as Footer
from docx.section import _Header as Header
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.section import Section, _Header as Header, _Footer as Footer
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
def print_element(element):

View File

@ -1,5 +1,4 @@
import pytest
from megaparse.Converter import MegaParse

View File

@ -2,7 +2,6 @@ import tempfile
from quivr_core import Brain
from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph
if __name__ == "__main__":
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file:

View File

@ -1,7 +1,7 @@
from datetime import datetime
from typing import Any, Generator, Tuple, List
from uuid import UUID, uuid4
from copy import deepcopy
from datetime import datetime
from typing import Any, Generator, List, Tuple
from uuid import UUID, uuid4
from langchain_core.messages import AIMessage, HumanMessage

View File

@ -2,9 +2,9 @@ import os
from enum import Enum
from typing import Dict, List, Optional
from uuid import UUID
from sqlmodel import SQLModel
from megaparse.config import MegaparseConfig
from sqlmodel import SQLModel
from quivr_core.base_config import QuivrBaseConfig
from quivr_core.processor.splitter import SplitterConfig

View File

@ -1,14 +1,14 @@
import datetime
from pydantic import ConfigDict, create_model
from langchain_core.prompts.base import BasePromptTemplate
from langchain_core.prompts import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
MessagesPlaceholder,
PromptTemplate,
SystemMessagePromptTemplate,
MessagesPlaceholder,
)
from langchain_core.prompts.base import BasePromptTemplate
from pydantic import ConfigDict, create_model
class CustomPromptsDict(dict):

View File

@ -1,7 +1,7 @@
import logging
from enum import Enum
from typing import Annotated, AsyncGenerator, Optional, Sequence, TypedDict
from uuid import uuid4
from enum import Enum
# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
from langchain.retrievers import ContextualCompressionRetriever
@ -12,7 +12,7 @@ from langchain_core.documents import BaseDocumentCompressor, Document
from langchain_core.messages import BaseMessage
from langchain_core.messages.ai import AIMessageChunk
from langchain_core.vectorstores import VectorStore
from langgraph.graph import START, END, StateGraph
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from quivr_core.chat import ChatHistory

View File

@ -41,7 +41,7 @@ dev-dependencies = [
]
[tool.rye.workspace]
members = [".", "core", "worker", "api", "docs", "core/examples/chatbot", "core/MegaParse"]
members = [".", "core", "worker", "api", "docs", "core/examples/chatbot", "core/MegaParse", "worker/diff-assistant"]
[tool.hatch.metadata]
allow-direct-references = true

View File

@ -20,7 +20,10 @@
# via quivr-worker
-e file:core/MegaParse
# via quivr-core
# via quivr-diff-assistant
-e file:worker
-e file:worker/diff-assistant
# via quivr-worker
aiofiles==23.2.1
# via chainlit
# via quivr-core
@ -43,6 +46,8 @@ anthropic==0.34.1
# via langchain-anthropic
antlr4-python3-runtime==4.9.3
# via omegaconf
anyascii==0.3.2
# via python-doctr
anyio==3.7.1
# via anthropic
# via asyncer
@ -126,6 +131,7 @@ click==8.1.7
# via mkdocs
# via mkdocstrings
# via nltk
# via python-oxmsg
# via uvicorn
click-didyoumean==0.3.1
# via celery
@ -178,6 +184,7 @@ defusedxml==0.7.1
# via fpdf2
# via langchain-anthropic
# via nbconvert
# via python-doctr
deprecated==1.2.14
# via llama-index-core
# via llama-index-legacy
@ -188,6 +195,8 @@ deprecated==1.2.14
# via pikepdf
deprecation==2.1.0
# via postgrest
diff-match-patch==20230430
# via quivr-diff-assistant
dirtyjson==1.0.8
# via llama-index-core
# via llama-index-legacy
@ -198,6 +207,7 @@ distro==1.9.0
# via openai
docx2txt==0.8
# via quivr-core
# via quivr-diff-assistant
dropbox==12.0.2
# via quivr-api
ecdsa==0.19.0
@ -214,6 +224,7 @@ executing==2.0.1
# via stack-data
faiss-cpu==1.8.0.post1
# via quivr-core
# via quivr-diff-assistant
fastapi==0.110.3
# via chainlit
# via quivr-api
@ -298,6 +309,9 @@ h11==0.14.0
# via wsproto
h2==4.1.0
# via httpx
h5py==3.10.0
# via python-doctr
# via quivr-diff-assistant
hpack==4.0.0
# via h2
httpcore==1.0.5
@ -325,6 +339,7 @@ httpx==0.27.0
httpx-sse==0.4.0
# via cohere
huggingface-hub==0.24.6
# via python-doctr
# via timm
# via tokenizers
# via transformers
@ -371,6 +386,7 @@ jmespath==1.0.1
# via botocore
joblib==1.4.2
# via nltk
# via scikit-learn
jsonpatch==1.33
# via langchain-core
jsonpath-python==1.0.6
@ -399,11 +415,12 @@ kiwisolver==1.4.5
# via matplotlib
kombu==5.4.0
# via celery
langchain==0.2.14
langchain==0.2.16
# via langchain-community
# via megaparse
# via quivr-api
# via quivr-core
# via quivr-diff-assistant
langchain-anthropic==0.1.23
# via quivr-core
# via quivr-monorepo
@ -414,7 +431,7 @@ langchain-community==0.2.12
# via megaparse
# via quivr-api
# via quivr-core
langchain-core==0.2.38
langchain-core==0.2.41
# via langchain
# via langchain-anthropic
# via langchain-cohere
@ -428,18 +445,20 @@ langchain-core==0.2.38
# via quivr-core
langchain-experimental==0.0.64
# via langchain-cohere
langchain-openai==0.1.22
langchain-openai==0.1.25
# via megaparse
# via quivr-api
# via quivr-diff-assistant
langchain-text-splitters==0.2.2
# via langchain
langdetect==1.0.9
# via python-doctr
# via unstructured
langgraph==0.2.14
# via quivr-core
langgraph-checkpoint==1.0.6
# via langgraph
langsmith==0.1.100
langsmith==0.1.126
# via langchain
# via langchain-community
# via langchain-core
@ -453,14 +472,15 @@ literalai==0.0.607
# via chainlit
llama-cloud==0.0.13
# via llama-index-indices-managed-llama-cloud
llama-index==0.10.67.post1
llama-index==0.11.12
# via megaparse
llama-index-agent-openai==0.2.9
# via quivr-diff-assistant
llama-index-agent-openai==0.3.4
# via llama-index
# via llama-index-program-openai
llama-index-cli==0.1.13
llama-index-cli==0.3.1
# via llama-index
llama-index-core==0.10.67
llama-index-core==0.11.12
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
@ -473,35 +493,39 @@ llama-index-core==0.10.67
# via llama-index-readers-file
# via llama-index-readers-llama-parse
# via llama-parse
llama-index-embeddings-openai==0.1.11
llama-index-embeddings-openai==0.2.5
# via llama-index
# via llama-index-cli
llama-index-indices-managed-llama-cloud==0.2.7
llama-index-indices-managed-llama-cloud==0.3.1
# via llama-index
llama-index-legacy==0.9.48.post3
# via llama-index
llama-index-llms-openai==0.1.30
llama-index-llms-openai==0.2.9
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
llama-index-multi-modal-llms-openai==0.1.9
# via quivr-diff-assistant
llama-index-multi-modal-llms-openai==0.2.1
# via llama-index
llama-index-program-openai==0.1.7
llama-index-program-openai==0.2.0
# via llama-index
# via llama-index-question-gen-openai
llama-index-question-gen-openai==0.1.3
llama-index-question-gen-openai==0.2.0
# via llama-index
llama-index-readers-file==0.1.33
llama-index-readers-file==0.2.2
# via llama-index
llama-index-readers-llama-parse==0.1.6
# via quivr-diff-assistant
llama-index-readers-llama-parse==0.3.0
# via llama-index
llama-parse==0.4.9
llama-parse==0.5.6
# via llama-index-readers-llama-parse
# via megaparse
# via quivr-api
llvmlite==0.43.0
# via numba
lxml==5.3.0
# via pikepdf
# via python-docx
@ -535,7 +559,9 @@ marshmallow==3.22.0
marshmallow-enum==1.5.1
# via unstructured-client
matplotlib==3.9.2
# via mplcursors
# via pycocotools
# via quivr-diff-assistant
# via unstructured-inference
matplotlib-inline==0.1.7
# via ipykernel
@ -576,6 +602,8 @@ mkdocstrings-python==1.11.1
# via mkdocstrings
monotonic==1.6
# via posthog
mplcursors==0.5.3
# via quivr-diff-assistant
mpmath==1.3.0
# via sympy
msal==1.30.0
@ -608,6 +636,7 @@ networkx==3.2.1
# via torch
# via unstructured
nltk==3.9.1
# via llama-index
# via llama-index-core
# via llama-index-legacy
# via unstructured
@ -615,16 +644,20 @@ nodeenv==1.9.1
# via pre-commit
notion-client==2.2.1
# via quivr-api
numba==0.60.0
# via quivr-diff-assistant
numpy==1.26.3
# via chainlit
# via contourpy
# via faiss-cpu
# via h5py
# via langchain
# via langchain-community
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via matplotlib
# via numba
# via onnx
# via onnxruntime
# via opencv-python
@ -633,12 +666,18 @@ numpy==1.26.3
# via pdf2docx
# via pgvector
# via pycocotools
# via python-doctr
# via quivr-diff-assistant
# via scikit-learn
# via scipy
# via shapely
# via torchvision
# via transformers
# via unstructured
oauthlib==3.2.2
# via requests-oauthlib
olefile==0.47
# via python-oxmsg
omegaconf==2.3.0
# via effdet
onnx==1.16.2
@ -646,21 +685,25 @@ onnx==1.16.2
# via unstructured-inference
onnxruntime==1.19.0
# via unstructured-inference
openai==1.42.0
openai==1.47.1
# via langchain-openai
# via litellm
# via llama-index-agent-openai
# via llama-index-core
# via llama-index-embeddings-openai
# via llama-index-legacy
# via llama-index-llms-openai
# via quivr-api
# via quivr-diff-assistant
# via quivr-worker
opencv-python==4.10.0.84
# via layoutparser
# via python-doctr
# via quivr-diff-assistant
# via unstructured-inference
opencv-python-headless==4.10.0.84
# via pdf2docx
openpyxl==3.1.5
# via quivr-diff-assistant
# via unstructured
opentelemetry-api==1.27.0
# via opentelemetry-exporter-otlp-proto-grpc
@ -720,8 +763,9 @@ paginate==0.5.7
pandas==2.2.2
# via langchain-cohere
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via llama-index-readers-file
# via quivr-diff-assistant
# via unstructured
pandocfilters==1.5.1
# via nbconvert
@ -747,6 +791,8 @@ pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
# via ipython
pgvector==0.3.2
# via quivr-api
pi-heif==0.18.0
# via unstructured
pikepdf==9.1.1
# via unstructured
pillow==10.2.0
@ -756,13 +802,12 @@ pillow==10.2.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via python-doctr
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.18.0
# via unstructured
platformdirs==4.2.2
# via black
# via jupyter-core
@ -819,6 +864,8 @@ pyasn1==0.6.0
# via rsa
pyasn1-modules==0.4.0
# via google-auth
pyclipper==1.3.0.post5
# via python-doctr
pycocotools==2.0.8
# via effdet
pycodestyle==2.12.1
@ -839,6 +886,7 @@ pydantic==2.8.2
# via litellm
# via literalai
# via llama-cloud
# via llama-index-core
# via openai
# via postgrest
# via pydantic-settings
@ -879,9 +927,12 @@ pyparsing==3.1.2
# via unstructured-client
pypdf==4.3.1
# via llama-index-readers-file
# via quivr-diff-assistant
# via unstructured
pypdfium2==4.30.0
# via pdfplumber
# via python-doctr
# via quivr-diff-assistant
pyproject-api==1.6.1
# via tox
pyreadline3==3.4.1 ; sys_platform == 'win32'
@ -910,6 +961,8 @@ python-dateutil==2.9.0.post0
# via realtime
# via storage3
# via unstructured-client
python-doctr==0.9.0
# via quivr-diff-assistant
python-docx==1.1.2
# via megaparse
# via pdf2docx
@ -921,6 +974,7 @@ python-dotenv==1.0.1
# via pydantic-settings
# via pytest-dotenv
# via quivr-api
# via quivr-diff-assistant
# via quivr-worker
python-engineio==4.9.1
# via python-socketio
@ -929,11 +983,14 @@ python-iso639==2024.4.27
python-jose==3.3.0
# via quivr-api
python-magic==0.4.27
# via quivr-diff-assistant
# via unstructured
python-multipart==0.0.9
# via chainlit
# via quivr-api
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
# via megaparse
# via unstructured
@ -967,6 +1024,7 @@ pyzmq==26.1.1
# via ipykernel
# via jupyter-client
rapidfuzz==3.9.6
# via python-doctr
# via unstructured
# via unstructured-inference
realtime==2.0.2
@ -1021,14 +1079,20 @@ s3transfer==0.10.2
safetensors==0.4.4
# via timm
# via transformers
scikit-learn==1.5.2
# via quivr-diff-assistant
scipy==1.14.1
# via layoutparser
# via python-doctr
# via scikit-learn
sentencepiece==0.2.0
# via transformers
sentry-sdk==2.13.0
# via quivr-api
setuptools==70.0.0
# via opentelemetry-instrumentation
shapely==2.0.6
# via python-doctr
simple-websocket==1.0.0
# via python-engineio
six==1.16.0
@ -1091,6 +1155,8 @@ tenacity==8.5.0
# via llama-index-legacy
termcolor==2.4.0
# via fire
threadpoolctl==3.5.0
# via scikit-learn
tiktoken==0.7.0
# via langchain-openai
# via litellm
@ -1141,6 +1207,7 @@ tqdm==4.66.5
# via llama-index-core
# via nltk
# via openai
# via python-doctr
# via transformers
# via unstructured
traitlets==5.14.3
@ -1180,6 +1247,7 @@ typing-extensions==4.12.2
# via pydantic-core
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via realtime
# via resend
@ -1199,9 +1267,10 @@ tzdata==2024.1
# via pandas
unidecode==1.3.8
# via quivr-api
unstructured==0.15.7
unstructured==0.15.13
# via megaparse
# via quivr-core
# via quivr-diff-assistant
unstructured-client==0.6.0
# via unstructured
unstructured-inference==0.7.36

View File

@ -20,7 +20,10 @@
# via quivr-worker
-e file:core/MegaParse
# via quivr-core
# via quivr-diff-assistant
-e file:worker
-e file:worker/diff-assistant
# via quivr-worker
aiofiles==24.1.0
# via quivr-core
aiohappyeyeballs==2.4.0
@ -42,6 +45,8 @@ anthropic==0.34.2
# via langchain-anthropic
antlr4-python3-runtime==4.9.3
# via omegaconf
anyascii==0.3.2
# via python-doctr
anyio==4.4.0
# via anthropic
# via httpx
@ -108,6 +113,7 @@ click==8.1.7
# via mkdocs
# via mkdocstrings
# via nltk
# via python-oxmsg
# via uvicorn
click-didyoumean==0.3.1
# via celery
@ -155,12 +161,15 @@ defusedxml==0.7.1
# via fpdf2
# via langchain-anthropic
# via nbconvert
# via python-doctr
deprecated==1.2.14
# via llama-index-core
# via llama-index-legacy
# via pikepdf
deprecation==2.1.0
# via postgrest
diff-match-patch==20230430
# via quivr-diff-assistant
dirtyjson==1.0.8
# via llama-index-core
# via llama-index-legacy
@ -169,6 +178,7 @@ distro==1.9.0
# via openai
docx2txt==0.8
# via quivr-core
# via quivr-diff-assistant
dropbox==12.0.2
# via quivr-api
ecdsa==0.19.0
@ -183,6 +193,7 @@ executing==2.1.0
# via stack-data
faiss-cpu==1.8.0.post1
# via quivr-core
# via quivr-diff-assistant
fastapi==0.112.1
# via quivr-api
# via sentry-sdk
@ -256,6 +267,9 @@ h11==0.14.0
# via uvicorn
h2==4.1.0
# via httpx
h5py==3.10.0
# via python-doctr
# via quivr-diff-assistant
hpack==4.0.0
# via h2
httpcore==1.0.5
@ -281,6 +295,7 @@ httpx==0.27.0
httpx-sse==0.4.0
# via cohere
huggingface-hub==0.24.6
# via python-doctr
# via timm
# via tokenizers
# via transformers
@ -322,6 +337,7 @@ jmespath==1.0.1
# via botocore
joblib==1.4.2
# via nltk
# via scikit-learn
jsonpatch==1.33
# via langchain-core
jsonpath-python==1.0.6
@ -350,11 +366,12 @@ kiwisolver==1.4.5
# via matplotlib
kombu==5.4.0
# via celery
langchain==0.2.14
langchain==0.2.16
# via langchain-community
# via megaparse
# via quivr-api
# via quivr-core
# via quivr-diff-assistant
langchain-anthropic==0.1.23
# via quivr-core
# via quivr-monorepo
@ -365,7 +382,7 @@ langchain-community==0.2.12
# via megaparse
# via quivr-api
# via quivr-core
langchain-core==0.2.38
langchain-core==0.2.41
# via langchain
# via langchain-anthropic
# via langchain-cohere
@ -379,18 +396,20 @@ langchain-core==0.2.38
# via quivr-core
langchain-experimental==0.0.64
# via langchain-cohere
langchain-openai==0.1.22
langchain-openai==0.1.25
# via megaparse
# via quivr-api
# via quivr-diff-assistant
langchain-text-splitters==0.2.2
# via langchain
langdetect==1.0.9
# via python-doctr
# via unstructured
langgraph==0.2.19
# via quivr-core
langgraph-checkpoint==1.0.9
# via langgraph
langsmith==0.1.100
langsmith==0.1.126
# via langchain
# via langchain-community
# via langchain-core
@ -400,14 +419,15 @@ litellm==1.43.19
# via quivr-api
llama-cloud==0.0.13
# via llama-index-indices-managed-llama-cloud
llama-index==0.10.67.post1
llama-index==0.11.12
# via megaparse
llama-index-agent-openai==0.2.9
# via quivr-diff-assistant
llama-index-agent-openai==0.3.4
# via llama-index
# via llama-index-program-openai
llama-index-cli==0.1.13
llama-index-cli==0.3.1
# via llama-index
llama-index-core==0.10.67
llama-index-core==0.11.12
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
@ -420,35 +440,39 @@ llama-index-core==0.10.67
# via llama-index-readers-file
# via llama-index-readers-llama-parse
# via llama-parse
llama-index-embeddings-openai==0.1.11
llama-index-embeddings-openai==0.2.5
# via llama-index
# via llama-index-cli
llama-index-indices-managed-llama-cloud==0.2.7
llama-index-indices-managed-llama-cloud==0.3.1
# via llama-index
llama-index-legacy==0.9.48.post3
# via llama-index
llama-index-llms-openai==0.1.30
llama-index-llms-openai==0.2.9
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
llama-index-multi-modal-llms-openai==0.1.9
# via quivr-diff-assistant
llama-index-multi-modal-llms-openai==0.2.1
# via llama-index
llama-index-program-openai==0.1.7
llama-index-program-openai==0.2.0
# via llama-index
# via llama-index-question-gen-openai
llama-index-question-gen-openai==0.1.3
llama-index-question-gen-openai==0.2.0
# via llama-index
llama-index-readers-file==0.1.33
llama-index-readers-file==0.2.2
# via llama-index
llama-index-readers-llama-parse==0.1.6
# via quivr-diff-assistant
llama-index-readers-llama-parse==0.3.0
# via llama-index
llama-parse==0.4.9
llama-parse==0.5.6
# via llama-index-readers-llama-parse
# via megaparse
# via quivr-api
llvmlite==0.43.0
# via numba
lxml==5.3.0
# via pikepdf
# via python-docx
@ -482,7 +506,9 @@ marshmallow==3.22.0
marshmallow-enum==1.5.1
# via unstructured-client
matplotlib==3.9.2
# via mplcursors
# via pycocotools
# via quivr-diff-assistant
# via unstructured-inference
matplotlib-inline==0.1.7
# via ipykernel
@ -521,6 +547,8 @@ mkdocstrings-python==1.11.1
# via mkdocstrings
monotonic==1.6
# via posthog
mplcursors==0.5.3
# via quivr-diff-assistant
mpmath==1.3.0
# via sympy
msal==1.30.0
@ -549,20 +577,25 @@ networkx==3.2.1
# via torch
# via unstructured
nltk==3.9.1
# via llama-index
# via llama-index-core
# via llama-index-legacy
# via unstructured
notion-client==2.2.1
# via quivr-api
numba==0.60.0
# via quivr-diff-assistant
numpy==1.26.3
# via contourpy
# via faiss-cpu
# via h5py
# via langchain
# via langchain-community
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via matplotlib
# via numba
# via onnx
# via onnxruntime
# via opencv-python
@ -571,12 +604,18 @@ numpy==1.26.3
# via pdf2docx
# via pgvector
# via pycocotools
# via python-doctr
# via quivr-diff-assistant
# via scikit-learn
# via scipy
# via shapely
# via torchvision
# via transformers
# via unstructured
oauthlib==3.2.2
# via requests-oauthlib
olefile==0.47
# via python-oxmsg
omegaconf==2.3.0
# via effdet
onnx==1.16.2
@ -584,21 +623,25 @@ onnx==1.16.2
# via unstructured-inference
onnxruntime==1.19.0
# via unstructured-inference
openai==1.42.0
openai==1.47.1
# via langchain-openai
# via litellm
# via llama-index-agent-openai
# via llama-index-core
# via llama-index-embeddings-openai
# via llama-index-legacy
# via llama-index-llms-openai
# via quivr-api
# via quivr-diff-assistant
# via quivr-worker
opencv-python==4.10.0.84
# via layoutparser
# via python-doctr
# via quivr-diff-assistant
# via unstructured-inference
opencv-python-headless==4.10.0.84
# via pdf2docx
openpyxl==3.1.5
# via quivr-diff-assistant
# via unstructured
orjson==3.10.7
# via langsmith
@ -624,8 +667,9 @@ paginate==0.5.7
pandas==2.2.2
# via langchain-cohere
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via llama-index-readers-file
# via quivr-diff-assistant
# via unstructured
pandocfilters==1.5.1
# via nbconvert
@ -650,6 +694,8 @@ pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
# via ipython
pgvector==0.3.2
# via quivr-api
pi-heif==0.18.0
# via unstructured
pikepdf==9.1.1
# via unstructured
pillow==10.2.0
@ -659,13 +705,12 @@ pillow==10.2.0
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via pillow-heif
# via python-doctr
# via python-pptx
# via torchvision
# via unstructured-pytesseract
pillow-heif==0.18.0
# via unstructured
platformdirs==4.3.2
# via jupyter-core
# via mkdocs-get-deps
@ -712,6 +757,8 @@ pyasn1==0.6.0
# via rsa
pyasn1-modules==0.4.0
# via google-auth
pyclipper==1.3.0.post5
# via python-doctr
pycocotools==2.0.8
# via effdet
pycparser==2.22 ; platform_python_implementation != 'PyPy' or implementation_name == 'pypy'
@ -728,6 +775,7 @@ pydantic==2.8.2
# via langsmith
# via litellm
# via llama-cloud
# via llama-index-core
# via openai
# via postgrest
# via pydantic-settings
@ -765,9 +813,12 @@ pyparsing==3.1.2
# via unstructured-client
pypdf==4.3.1
# via llama-index-readers-file
# via quivr-diff-assistant
# via unstructured
pypdfium2==4.30.0
# via pdfplumber
# via python-doctr
# via quivr-diff-assistant
pyreadline3==3.4.1 ; sys_platform == 'win32'
# via humanfriendly
python-dateutil==2.9.0.post0
@ -781,6 +832,8 @@ python-dateutil==2.9.0.post0
# via realtime
# via storage3
# via unstructured-client
python-doctr==0.9.0
# via quivr-diff-assistant
python-docx==1.1.2
# via megaparse
# via pdf2docx
@ -790,16 +843,20 @@ python-dotenv==1.0.1
# via megaparse
# via pydantic-settings
# via quivr-api
# via quivr-diff-assistant
# via quivr-worker
python-iso639==2024.4.27
# via unstructured
python-jose==3.3.0
# via quivr-api
python-magic==0.4.27
# via quivr-diff-assistant
# via unstructured
python-multipart==0.0.9
# via quivr-api
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
# via megaparse
# via unstructured
@ -830,6 +887,7 @@ pyzmq==26.2.0
# via ipykernel
# via jupyter-client
rapidfuzz==3.9.6
# via python-doctr
# via unstructured
# via unstructured-inference
realtime==2.0.2
@ -882,12 +940,18 @@ s3transfer==0.10.2
safetensors==0.4.4
# via timm
# via transformers
scikit-learn==1.5.2
# via quivr-diff-assistant
scipy==1.14.1
# via layoutparser
# via python-doctr
# via scikit-learn
sentencepiece==0.2.0
# via transformers
sentry-sdk==2.13.0
# via quivr-api
shapely==2.0.6
# via python-doctr
six==1.16.0
# via asttokens
# via bleach
@ -945,6 +1009,8 @@ tenacity==8.5.0
# via llama-index-legacy
termcolor==2.4.0
# via fire
threadpoolctl==3.5.0
# via scikit-learn
tiktoken==0.7.0
# via langchain-openai
# via litellm
@ -992,6 +1058,7 @@ tqdm==4.66.5
# via llama-index-core
# via nltk
# via openai
# via python-doctr
# via transformers
# via unstructured
traitlets==5.14.3
@ -1029,6 +1096,7 @@ typing-extensions==4.12.2
# via pydantic-core
# via pyee
# via python-docx
# via python-oxmsg
# via python-pptx
# via realtime
# via resend
@ -1048,9 +1116,10 @@ tzdata==2024.1
# via pandas
unidecode==1.3.8
# via quivr-api
unstructured==0.15.7
unstructured==0.15.13
# via megaparse
# via quivr-core
# via quivr-diff-assistant
unstructured-client==0.8.1
# via unstructured
unstructured-inference==0.7.36

View File

@ -0,0 +1,9 @@
alter table "public"."tasks" add column "assistant_name" text;
alter publication supabase_realtime add table tasks;

View File

@ -0,0 +1,4 @@
alter table "public"."tasks" add column "task_metadata" jsonb;

View File

@ -0,0 +1,2 @@
OPENAI_API_KEY = myopenaikey
LLAMA_PARSE_API_KEY = myllamaparsekey

View File

@ -0,0 +1,15 @@
# python generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# venv
.venv
.env
.DS_Store
#pkl
*.pkl

View File

@ -0,0 +1 @@
3.11.9

View File

@ -0,0 +1,3 @@
# diff-assistant
Describe your project here.

View File

@ -0,0 +1,958 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from diff_algorithm import DiffAlgorithm\n",
"from parser import DeadlyParser\n",
"\n",
"file_path_after = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/etiquette_0_before.pdf\"\n",
"file_path_before = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/etiquette_0_after.pdf\"\n",
"complex_file = \"/Users/chloed./Documents/quivr/diff-assistant/src/cdp3/test_docs/Cas3-2-3.pdf\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"parser = DeadlyParser()\n",
"parsed_before = parser.parse(file_path_before)\n",
"parsed_after = parser.parse(file_path_after)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"text_before = parsed_before.render()\n",
"text_after = parsed_after.render()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CUDA device False\n",
"\n",
"0: 1024x800 2 Pictures, 2 Section-headers, 18 Texts, 1091.6ms\n",
"Speed: 24.9ms preprocess, 1091.6ms inference, 84.6ms postprocess per image at shape (1, 3, 1024, 800)\n"
]
}
],
"source": [
"from PIL import Image\n",
"import pypdfium2 as pdfium\n",
"import torchvision.transforms as transforms\n",
"\n",
"import torch\n",
"from ultralytics import YOLOv10\n",
"\n",
"print(\"CUDA device\", torch.cuda.is_available())\n",
"\n",
"device = torch.device(\"mps\") # Default CUDA device\n",
"\n",
"model = YOLOv10(\"./yolov10x_best.pt\").to(device)\n",
"\n",
"pdf = pdfium.PdfDocument(file_path_after)\n",
"page = pdf[0] # load a page\n",
"\n",
"bitmap = page.render(scale=500 / 72)\n",
"\n",
"pil_image = bitmap.to_pil()\n",
"\n",
"# Create a transform to convert PIL image to tensor\n",
"to_tensor = transforms.ToTensor()\n",
"\n",
"# Convert PIL image to tensor (this also normalizes values to [0, 1])\n",
"tensor_image = to_tensor(pil_image)\n",
"\n",
"# Add batch dimension\n",
"tensor_image = tensor_image.unsqueeze(0).to(device)\n",
"\n",
"# Assuming your model is already on the CUDA device\n",
"model = model.to(device)\n",
"\n",
"# Perform inference\n",
"with torch.no_grad():\n",
" results = model.predict(source=pil_image, imgsz=1024, conf=0.35, batch=1)\n",
"\n",
"\n",
"annotated_image = results[0].plot()[:, :, ::-1]\n",
"\n",
"im = Image.fromarray(annotated_image)\n",
"\n",
"im.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([0.8352, 0.8235, 0.8203, 0.8113, 0.7984, 0.7860, 0.6394, 0.5778, 0.5666, 0.5546, 0.5365, 0.5300, 0.4666, 0.4322, 0.4222, 0.3932, 0.3926, 0.3901], device='mps:0')\n",
"tensor([6., 9., 7., 9., 6., 9., 6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.], device='mps:0')\n"
]
}
],
"source": [
"print(results[0].boxes.conf)\n",
"print(results[0].boxes.cls)\n",
"results[0].boxes.xyxyn"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"from langchain_core.messages import HumanMessage, SystemMessage\n",
"from io import BytesIO\n",
"import base64\n",
"def check_transcription(file_path, text):\n",
" pdf = pdfium.PdfDocument(file_path)\n",
" page = pdf[0] # load a page\n",
" \n",
" bitmap = page.render(scale=500 / 72)\n",
" \n",
" pil_image_before = bitmap.to_pil()\n",
" \n",
" buffered = BytesIO()\n",
" pil_image_before.save(buffered, format=\"PNG\")\n",
" img_str = base64.b64encode(buffered.getvalue()).decode()\n",
" \n",
" chat = ChatOpenAI(model=\"gpt-4o\", temperature=0)\n",
" result = chat.invoke(\n",
" [\n",
" HumanMessage(\n",
" content=[\n",
" {\"type\": \"text\", \"text\": f\"Can you correct this entire text retranscription, respond only with the corrected transcription: {text}\"},\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": f\"data:image/jpeg;base64,{img_str}\",\n",
" \"detail\": \"auto\",\n",
" },\n",
" },\n",
" ]\n",
" )\n",
" ]\n",
" )\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"result_before = check_transcription(file_path_before, text_before)\n",
"result_after = check_transcription(file_path_after, text_after)"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Coup de pâtes\n",
"TRADITION & INNOVATION\n",
"\n",
"50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\n",
"50 Thin crêpes sweetened with rum Negrita® (folded in four) D270 55g\n",
"\n",
"25514\n",
"Rhum NEGRITA\n",
"50 Crêpes fines sucrées au rhum cuites, surgelées -\n",
"50 Crêpes sweetened with rum, baked, frozen\n",
"\n",
"Ingrédients : LAIT entier, farine de BLE, sucre de canne 16.4%, ŒUFS entiers*, beurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.6%, sel, poudres à lever: E500-E331-amidon de BLE.\n",
"* Œufs issus de poules élevées au sol\n",
"\n",
"Ingredients : Whole MILK, WHEAT flour, cane sugar 16.4%, whole EGGS*, concentrated butter (MILK), water, Negrita rum (colouring: E150a) 3.6%, salt, raising agents: E500-E331-WHEAT starch.\n",
"* Barn eggs\n",
"\n",
"Conseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et maintien à 4°C, le produit se conserve au maximum pendant 24 heures. Suggestion: possibilité de décongeler les crêpes 30 secondes au four à micro-ondes.\n",
"How to prepare the product: Defrost the product 1 hour at 0°C - +4°C. After thawing, preserve the product at +4°C for 24 hours maximum. Suggestion: Defrost the crêpe 30 sec in the microwave.\n",
"\n",
"Informations nutritionnelles pour 100g / Average nutritional values for 100g:\n",
"Valeur énergétique/Energy: 1495 kJ / 356 kcal\n",
"Matières grasses totales/Fat (g): 11.4\n",
"- dont Acides Gras Saturés/of which saturated fatty acids (g): 5.9\n",
"Glucides/Carbohydrates (g): 49.5\n",
"- dont sucres/of which sugar (g): 25.2\n",
"Protéines/Proteins (g): 8.0\n",
"Sel/Salt (g): 0.45\n",
"\n",
"A conserver à -18°C : Ne jamais recongeler un produit décongelé\n",
"Store at -18°C: Don't refreeze, once defrosted\n",
"\n",
"Coup de pâtes\n",
"50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\n",
"50 Crêpes fines sucrées au rhum cuites, surgelées -\n",
"50 Crêpes sweetened with rum, baked, frozen\n",
"\n",
"N° DE LOT / BATCH : 116241 13:17\n",
"A consommer de préférence avant le / Best before : 25/10/2025\n",
"\n",
"25514\n",
"FAB : A04A\n",
"\n",
"(01)03604380255141(15)251025(10)116241(91)0316175\n",
"EAN No: 03604380255141\n",
"\n",
"Poids net / Net weight : 2750 g\n",
"\n",
"C.I: 7142 COUP DE PATES S.A.S. ZAC DU BEL AIR - 14-16 AVENUE JOSEPH PAXTON - FERRIERES EN BRIE - 77164 MARNE LA VALLEE CEDEX 3 - FRANCE\n"
]
}
],
"source": [
"print(result_after.content)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, '50 CREPES FINES SUCREES AU\\nCoupdegal'),\n",
" (-1, 'g'),\n",
" (1, 'o'),\n",
" (0, '\\nRHUM NEGRITAO (PLIEES EN QUATRE)\\nTRA'),\n",
" (-1, 'C'),\n",
" (1, 'D'),\n",
" (0, 'ITION '),\n",
" (-1, '&'),\n",
" (1, 'a'),\n",
" (0, ' INNO'),\n",
" (-1, 'V'),\n",
" (1, 'Y'),\n",
" (0, 'AT'),\n",
" (-1, 'IG'),\n",
" (1, ':O'),\n",
" (0, 'N\\nD270 55g\\n50 Thin cr'),\n",
" (-1, 'ê'),\n",
" (1, 'è'),\n",
" (0, 'pes sweetened with rum Negrita'),\n",
" (-1, 'g'),\n",
" (1, 'e'),\n",
" (0, '\\n(folded in four) D270 55g\\n25514 R'),\n",
" (-1, 'k'),\n",
" (1, 'h'),\n",
" (0, 'um'),\n",
" (-1, 'y'),\n",
" (0, '\\n'),\n",
" (1, 'NEGRITA '),\n",
" (0, '50 Crêpes fines sucrées au rhum cuites, surgelées -\\n'),\n",
" (-1, 'NEGRITA\\n'),\n",
" (0, '50 Cr'),\n",
" (-1, 'è'),\n",
" (1, 'ê'),\n",
" (0,\n",
" 'pes sweetened with rum, baked, frozen\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0,\n",
" '%, CEUFS entiers*,\\nbeurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.'),\n",
" (-1, '7'),\n",
" (1, '6'),\n",
" (0,\n",
" '%, sel, poudres à\\nlever: E500-E331-amidon de BLE.\\n* CEufs issus de poules élevées au sol\\nIngredients : Whole MILK, WHEAT flour, cane sugar 16.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0, '%, whole EGGS*, con'),\n",
" (-1, 'c'),\n",
" (1, 'ç'),\n",
" (0, 'entrated\\nbutter (MILK), water, Negrita rum (colouring: E150a) 3.'),\n",
" (-1, '7'),\n",
" (1, '6'),\n",
" (0,\n",
" \"%, salt, raising agents:\\nE500-E331-WHEAT starch.\\n* Barn eggs\\nConseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et\\nmaintien à 4°C, le produit se conserve au maximum pendant 24 heures\"),\n",
" (1, '.'),\n",
" (0, '\\nSuggestion: possibilité de décongeler les cr'),\n",
" (-1, 'è'),\n",
" (1, 'é'),\n",
" (0, 'pes 30 secondes au four à micr'),\n",
" (-1, 'o'),\n",
" (1, 'c'),\n",
" (0, '-ondes.\\n'),\n",
" (-1, 'BPA le 24.09.2020 '),\n",
" (0, \"How to prepare the products: Defrost the product 1 hour at 0'C-+4\"),\n",
" (1, '°'),\n",
" (0, 'C. After thawing,\\npreserve the product at +4'),\n",
" (-1, '°'),\n",
" (1, '*'),\n",
" (0,\n",
" 'C for 24 hours maximum. Suggestion: Defrost the crèpe 30 sec\\nin the microwave.\\nInformations nutritionnelles pour 1 Average nutritional values for 100g:\\nValeur '),\n",
" (-1, 'e'),\n",
" (1, 'é'),\n",
" (0, 'nerg'),\n",
" (-1, 'e'),\n",
" (1, 'é'),\n",
" (0, 'tique/Energy: 149'),\n",
" (-1, '7'),\n",
" (1, '5'),\n",
" (0, ' kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0, '\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): '),\n",
" (-1, '6'),\n",
" (1, '5'),\n",
" (0, '.'),\n",
" (-1, '1'),\n",
" (1, '9'),\n",
" (0, '\\nGiu'),\n",
" (-1, 'c'),\n",
" (1, 'ri'),\n",
" (0, 'des/Car'),\n",
" (-1, 'p'),\n",
" (1, 'b'),\n",
" (0, 'o'),\n",
" (-1, 'n'),\n",
" (1, 'h'),\n",
" (0, 'y'),\n",
" (-1, 'ct'),\n",
" (1, 'di'),\n",
" (0, 'ates (g): 4'),\n",
" (-1, '8'),\n",
" (1, '9'),\n",
" (0, '.'),\n",
" (-1, '9'),\n",
" (1, '5'),\n",
" (0, '\\n'),\n",
" (-1, '- '),\n",
" (0, 'dont sucres/of which sugar (g): 2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '.'),\n",
" (-1, '1'),\n",
" (1, '2'),\n",
" (0, '\\nProtéines'),\n",
" (-1, '/'),\n",
" (0, 'Proteins (g): 8.0\\nSel/Salt (g): 0.4'),\n",
" (-1, '8'),\n",
" (1, '5'),\n",
" (0,\n",
" \"\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n\"),\n",
" (-1, 'Fabriqué en France - Made in France\\n'),\n",
" (0, 'Cou'),\n",
" (-1, 'y'),\n",
" (0, 'pde'),\n",
" (-1, 'g'),\n",
" (1, ' '),\n",
" (0, 'al'),\n",
" (-1, 'g'),\n",
" (0, '\\n50 CREPES FINES SUCREES AU RHUM\\nNEGRITA'),\n",
" (-1, 'O'),\n",
" (1, 'B'),\n",
" (0, ' (PLIEES EN QUATRE) D270\\nT'),\n",
" (-1, 'R'),\n",
" (1, 'W'),\n",
" (0, 'ADITION & INNOVAT'),\n",
" (-1, ':'),\n",
" (1, 'I'),\n",
" (0, 'ON\\n55g\\n50 Cr'),\n",
" (-1, 'ê'),\n",
" (1, 'è'),\n",
" (0,\n",
" 'pes fines sucrées au rhum cuites, surgelées -\\nNo DE LOTI\\n50 Crèpes sweetened with rum, baked, frozen\\nBATCH : '),\n",
" (-1, '084'),\n",
" (1, '116'),\n",
" (0, '2'),\n",
" (-1, '0'),\n",
" (1, '4'),\n",
" (0, '1 1'),\n",
" (-1, '5'),\n",
" (1, '3'),\n",
" (0, ':'),\n",
" (-1, '4'),\n",
" (1, '1'),\n",
" (0, '7\\nA consommer de pr'),\n",
" (-1, 'è'),\n",
" (1, 'é'),\n",
" (0, 'f'),\n",
" (-1, 'è'),\n",
" (1, 'é'),\n",
" (0, 'rence avant'),\n",
" (-1, ' '),\n",
" (0, 'le '),\n",
" (-1, 'I'),\n",
" (1, '/'),\n",
" (0, '\\n25514\\nBest before : 2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '/'),\n",
" (1, '1'),\n",
" (0, '0'),\n",
" (-1, '9'),\n",
" (0, '/202'),\n",
" (-1, '1'),\n",
" (1, '5'),\n",
" (0, '\\n'),\n",
" (1, 'FAB :\\nA'),\n",
" (0, '0'),\n",
" (1, '4A\\n'),\n",
" (0, '0'),\n",
" (-1, '9'),\n",
" (1, '1.'),\n",
" (0, '0'),\n",
" (1, '9'),\n",
" (0, '80'),\n",
" (-1, '43'),\n",
" (1, '.9'),\n",
" (0, '80'),\n",
" (1, '2'),\n",
" (0, '55141052'),\n",
" (1, '5'),\n",
" (0, '10'),\n",
" (-1, '9'),\n",
" (0, '2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '10'),\n",
" (-1, '08'),\n",
" (1, '1162'),\n",
" (0, '4'),\n",
" (-1, '20'),\n",
" (0, '1 (91)0316'),\n",
" (-1, '4'),\n",
" (1, '1'),\n",
" (0, '7'),\n",
" (-1, '6'),\n",
" (1, '5'),\n",
" (0, '\\nEAN No: 03604380255141'),\n",
" (-1, ' FAB : 00001 '),\n",
" (1, '\\n'),\n",
" (0, 'Poids net'),\n",
" (-1, '\\n:\\n'),\n",
" (1, '! '),\n",
" (0, '2750\\nNet weight'),\n",
" (1, ': :'),\n",
" (0, '\\ng\\n'),\n",
" (-1, '\\n'),\n",
" (1, 'Ci: 7142 '),\n",
" (0, 'COUP DE'),\n",
" (1, 'F'),\n",
" (0, ' PATES'),\n",
" (-1, 'E'),\n",
" (1, 'O'),\n",
" (0, ' S'),\n",
" (-1, '.'),\n",
" (0, 'A.S'),\n",
" (-1, '-;'),\n",
" (0, ' ZA'),\n",
" (-1, 'C'),\n",
" (1, 'Ç'),\n",
" (0, ' DU BEL AIR'),\n",
" (-1, '-'),\n",
" (0, ' 14-16 AVENUE'),\n",
" (1, '.'),\n",
" (0, ' '),\n",
" (-1, 'J'),\n",
" (1, 'V'),\n",
" (0, 'OSEPH'),\n",
" (-1, ' '),\n",
" (0, 'PAXTON-\\nFERRIERES EN'),\n",
" (-1, 'I'),\n",
" (0, ' BRIE 77'),\n",
" (-1, '6'),\n",
" (1, '8'),\n",
" (0, '14 MARNE LA VALLEE CEDEX 3'),\n",
" (1, '- FRANÇE')]"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dmp= DiffAlgorithm()\n",
"diff_main = dmp.diff_main(result_before.content, result_after.content)\n",
"#diff_main = dmp.diff_main(text_before, text_after)\n",
"#result = dmp.to_pretty_json(diff_main, parsed_before)\n",
"diff_main"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"#split differences and send to llm \n",
"cleaned_diff = []\n",
"for cat, content in diff_main:\n",
" if content.strip() and content != \"\\n\":\n",
" cleaned_diff.append((cat, content))"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [],
"source": [
"def format_difference(main_diff):\n",
" text_modified = \"\"\n",
" sub_stack = 0\n",
" for op, data in main_diff:\n",
" if op == 0: \n",
" text_modified += data if sub_stack == 0 else f\"_]] {data}\"\n",
" elif op == -1: \n",
" if sub_stack == 0:\n",
" text_modified += f\"[[{data}->\"\n",
" sub_stack += 1\n",
" else:\n",
" text_modified += f\"{data}->\"\n",
" elif op == 1: \n",
" if sub_stack > 0:\n",
" text_modified += f\"{data}]]\"\n",
" sub_stack -= 1\n",
" else:\n",
" text_modified += f\"[[ _ ->{data}]]\"\n",
" return text_modified"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"\"50 CREPES FINES SUCREES AU\\nCoupdegal[[g->o]]\\nRHUM NEGRITAO (PLIEES EN QUATRE)\\nTRA[[C->D]]ITION [[&->a]] INNO[[V->Y]]AT[[IG->:O]]N\\nD270 55g\\n50 Thin cr[[ê->è]]pes sweetened with rum Negrita[[g->e]]\\n(folded in four) D270 55g\\n25514 R[[k->h]]um[[y->NEGRITA ]]50 Crêpes fines sucrées au rhum cuites, surgelées -\\n[[NEGRITA\\n->_]] 50 Crè->ê]]pes sweetened with rum, baked, frozen\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.[[6->4]]%, CEUFS entiers*,\\nbeurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.[[7->6]]%, sel, poudres à\\nlever: E500-E331-amidon de BLE.\\n* CEufs issus de poules élevées au sol\\nIngredients : Whole MILK, WHEAT flour, cane sugar 16.[[6->4]]%, whole EGGS*, con[[c->ç]]entrated\\nbutter (MILK), water, Negrita rum (colouring: E150a) 3.[[7->6]]%, salt, raising agents:\\nE500-E331-WHEAT starch.\\n* Barn eggs\\nConseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et\\nmaintien à 4°C, le produit se conserve au maximum pendant 24 heures[[ _ ->.]]\\nSuggestion: possibilité de décongeler les cr[[è->é]]pes 30 secondes au four à micr[[o->c]]-ondes.\\n[[BPA le 24.09.2020 ->_]] How to prepare the products: Defrost the product 1 hour at 0'C-+4°]]C. After thawing,\\npreserve the product at +4[[°->*]]C for 24 hours maximum. Suggestion: Defrost the crèpe 30 sec\\nin the microwave.\\nInformations nutritionnelles pour 1 Average nutritional values for 100g:\\nValeur [[e->é]]nerg[[e->é]]tique/Energy: 149[[7->5]] kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.[[6->4]]\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): [[6->5]].[[1->9]]\\nGiu[[c->ri]]des/Car[[p->b]]o[[n->h]]y[[ct->di]]ates (g): 4[[8->9]].[[9->5]][[- ->_]] dont sucres/of which sugar (g): 24->5]].[[1->2]]\\nProtéines[[/->_]] Proteins (g): 8.0\\nSel/Salt (g): 0.48->5]]\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n[[Fabriqué en France - Made in France\\n->_]] Couy->_]] pdeg->_]] alg->_]] \\n50 CREPES FINES SUCREES AU RHUM\\nNEGRITAO->B]] (PLIEES EN QUATRE) D270\\nT[[R->W]]ADITION & INNOVAT[[:->I]]ON\\n55g\\n50 Cr[[ê->è]]pes fines sucrées au rhum cuites, surgelées -\\nNo DE LOTI\\n50 Crèpes sweetened with rum, baked, frozen\\nBATCH : [[084->116]]2[[0->4]]1 1[[5->3]]:[[4->1]]7\\nA consommer de pr[[è->é]]f[[è->é]]rence avantle [[I->/]]\\n25514\\nBest before : 2[[4->5]]/[[ _ ->1]]0[[9->_]] /2021->5]][[ _ ->FAB :\\nA]]0[[ _ ->4A\\n]]0[[9->1.]]0[[ _ ->9]]80[[43->.9]]80[[ _ ->2]]55141052[[ _ ->5]]10[[9->_]] 24->5]]10[[08->1162]]4[[20->_]] 1 (91)03164->1]]7[[6->5]]\\nEAN No: 03604380255141[[ FAB : 00001 ->_]] Poids net\\n:\\n->! ]]2750\\nNet weight[[ _ ->: :]]\\ng\\n[[ _ ->Ci: 7142 ]]COUP DE[[ _ ->F]] PATES[[E->O]] S[[.->_]] A.S-;->_]] ZAC->Ç]] DU BEL AIR[[-->_]] 14-16 AVENUE.]][[J->V]]OSEPHPAXTON-\\nFERRIERES EN[[I->_]] BRIE 776->8]]14 MARNE LA VALLEE CEDEX 3[[ _ ->- FRANÇE]]\""
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"format_difference(cleaned_diff)"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"import os\n",
"\n",
"llm = ChatOpenAI(\n",
" model=\"gpt-4o\",\n",
" temperature=0,\n",
" max_tokens=None,\n",
" timeout=None,\n",
" max_retries=2,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [],
"source": [
"section_diffs = [cleaned_diff]"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
"report = []\n",
"#modified_section_names = []\n",
"for section in section_diffs:\n",
" if len(section) == 1 and section[0][0] == 0:\n",
" print(\"No differences found in this section.\")\n",
" continue\n",
" else:\n",
" text_modified = format_difference(section)\n",
" #modified_section_names.append(section[0][1].split(\"\\n\")[1].split(\"#\")[-1].strip())\n",
" messages = [\n",
" (\n",
" \"human\",\n",
" f\"\"\"You are tasked with analyzing and reporting differences in text for a Quality engineer. The input text contains differences marked with special tokens. Your job is to parse these differences and create a clear, concise report.\n",
"\n",
" Here is the text containing the differences:\n",
"\n",
" <diff_text>\n",
" {text_modified}\n",
" </diff_text>\n",
"\n",
" RULE #1 : If there are no [[->]] tokens, it indicates no changes to report, inventing changes means death.\n",
" The differences are marked using the following format:\n",
" - [[before->after]] indicates a change from \"before\" to \"after\"\n",
" - If there is no \"before\" text, it indicates an addition\n",
" - If there is no \"after\" text, it indicates a deletion\n",
" - If there is no [[ ]] token, it indicates no changes to report\n",
" - Make sense of the difference and do not keep the '[' in the report.\n",
" - \"_\" alone means empty.\n",
"\n",
" Follow these steps to create your report:\n",
"\n",
" 1. Carefully read through the entire text.\n",
" 2. Identify each instance of [[ ]] tokens.\n",
" 3. For each instance, determine the modification that was made.\n",
" Present your report in the following format:\n",
" <report>\n",
" In the section ..., the modification found are :\n",
" * the **black** cat was changed to : the **red** cat\n",
" * ...\n",
" </report>\n",
" Note that there might be no modifications in some sections. In that case, simply state that no differences were found.\n",
"\n",
"\n",
" Remember, your goal is to create a clear and concise report that allows the Quality engineer to quickly verify the differences. Focus on accuracy and readability in your output, give every indication possible to make it easier to find the modification.\n",
" The report should be written in a professional and formal tone and in French.\"\"\",\n",
" ),\n",
" ]\n",
" response = llm.invoke(messages)\n",
" report.append(response.content)\n",
"\n",
"#print(\"The modified Sections are : \", modified_section_names)"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<report>\n",
"Dans la section \"50 CREPES FINES SUCREES AU\", les modifications trouvées sont :\n",
"* Coupdegal**g** a été changé en : Coupdegal**o**\n",
"\n",
"Dans la section \"RHUM NEGRITAO (PLIEES EN QUATRE)\", les modifications trouvées sont :\n",
"* TRA**C**ITION a été changé en : TRA**D**ITION\n",
"* TRA**&** INNO**V**ATION a été changé en : TRA**a** INNO**Y**ATION\n",
"* INNO**V**ATION a été changé en : INNO**Y**ATION\n",
"* INNO**IG**N a été changé en : INNO**:O**N\n",
"\n",
"Dans la section \"50 Thin crêpes sweetened with rum Negrita\", les modifications trouvées sont :\n",
"* cr**ê**pes a été changé en : cr**è**pes\n",
"* Negrita**g** a été changé en : Negrita**e**\n",
"\n",
"Dans la section \"25514 Rhum NEGRITA 50 Crêpes fines sucrées au rhum cuites, surgelées\", les modifications trouvées sont :\n",
"* R**k**um a été changé en : R**h**um\n",
"* Rhum**y** a été changé en : Rhum**NEGRITA**\n",
"* NEGRITA a été changé en : (supprimé)\n",
"* Crè**ê**pes a été changé en : Crè**e**pes\n",
"\n",
"Dans la section \"Ingrédients\", les modifications trouvées sont :\n",
"* sucre de canne 16.**6**% a été changé en : sucre de canne 16.**4**%\n",
"* rhum Negrita (colorant: E150a) 3.**7**% a été changé en : rhum Negrita (colorant: E150a) 3.**6**%\n",
"\n",
"Dans la section \"Ingredients\", les modifications trouvées sont :\n",
"* cane sugar 16.**6**% a été changé en : cane sugar 16.**4**%\n",
"* con**c**entrated butter a été changé en : con**ç**entrated butter\n",
"* Negrita rum (colouring: E150a) 3.**7**% a été changé en : Negrita rum (colouring: E150a) 3.**6**%\n",
"\n",
"Dans la section \"Conseil d'utilisation\", les modifications trouvées sont :\n",
"* 24 heures** _ ** a été changé en : 24 heures**.**\n",
"* cr**è**pes a été changé en : cr**é**pes\n",
"* micr**o**-ondes a été changé en : micr**c**-ondes\n",
"* BPA le 24.09.2020 a été changé en : (supprimé)\n",
"\n",
"Dans la section \"How to prepare the products\", les modifications trouvées sont :\n",
"* 0'C-+4**°**C a été changé en : 0'C-+4**C**\n",
"* +4**°**C a été changé en : +4**C**\n",
"\n",
"Dans la section \"Valeur énergétique/Energy\", les modifications trouvées sont :\n",
"* Valeur **e**nerg**e**tique a été changé en : Valeur **é**nerg**é**tique\n",
"* 149**7** kJ a été changé en : 149**5** kJ\n",
"\n",
"Dans la section \"Matières grasses totales/Fat (g)\", les modifications trouvées sont :\n",
"* 11.**6** a été changé en : 11.**4**\n",
"\n",
"Dans la section \"Acides Gras Saturés/of which saturated fatty acids (g)\", les modifications trouvées sont :\n",
"* **6**.1 a été changé en : **5**.9\n",
"\n",
"Dans la section \"Glucides/Carbohydrates (g)\", les modifications trouvées sont :\n",
"* Giu**c**des a été changé en : Giu**ri**des\n",
"* Car**p**o**n**y**ct**ates a été changé en : Car**b**o**h**y**di**ates\n",
"* 4**8**.9 a été changé en : 4**9**.5\n",
"* 24**-**1 a été changé en : 24**.2**\n",
"\n",
"Dans la section \"Protéines/Proteins (g)\", les modifications trouvées sont :\n",
"* Protéines**/** a été changé en : Protéines\n",
"\n",
"Dans la section \"Sel/Salt (g)\", les modifications trouvées sont :\n",
"* 0.48**->5** a été changé en : 0.48**5**\n",
"\n",
"Dans la section \"A conserver à -18°C\", les modifications trouvées sont :\n",
"* Fabriqué en France - Made in France a été changé en : (supprimé)\n",
"\n",
"Dans la section \"50 CREPES FINES SUCREES AU RHUM\", les modifications trouvées sont :\n",
"* NEGRITAO**->B** a été changé en : NEGRITAO**B**\n",
"* T**R**ADITION a été changé en : T**W**ADITION\n",
"* INNOVAT**:**ON a été changé en : INNOVAT**I**ON\n",
"\n",
"Dans la section \"50 Crêpes fines sucrées au rhum cuites, surgelées\", les modifications trouvées sont :\n",
"* cr**ê**pes a été changé en : cr**è**pes\n",
"\n",
"Dans la section \"BATCH\", les modifications trouvées sont :\n",
"* 084**2**0 a été changé en : 116**4**1\n",
"* 1**5**:4**7** a été changé en : 1**3**:1**7**\n",
"\n",
"Dans la section \"A consommer de préférence avant le\", les modifications trouvées sont :\n",
"* 2**4**/10/2021 a été changé en : 2**5**/10/2021\n",
"* FAB : A0 a été changé en : FAB : 4A\n",
"* 09.0 a été changé en : 1.0\n",
"* 98043 a été changé en : 980\n",
"* 255141052 a été changé en : 2551410525\n",
"* 109 a été changé en : 109\n",
"* 24.10.08 a été changé en : 24.10.1162\n",
"* 20 a été changé en : 20\n",
"* 1 (91)03164 a été changé en : 1 (91)03164-17\n",
"\n",
"Dans la section \"EAN No\", les modifications trouvées sont :\n",
"* EAN No: 03604380255141 FAB : 00001 a été changé en : EAN No: 03604380255141\n",
"\n",
"Dans la section \"Poids net\", les modifications trouvées sont :\n",
"* Poids net : 2750 a été changé en : Poids net : 2750\n",
"\n",
"Dans la section \"Net weight\", les modifications trouvées sont :\n",
"* Net weight : : a été changé en : Net weight : :\n",
"\n",
"Dans la section \"COUP DE PATES\", les modifications trouvées sont :\n",
"* COUP DE PATES a été changé en : COUP DE F PATES\n",
"* S.A.S a été changé en : S.A.S\n",
"* ZAC DU BEL AIR a été changé en : ZAC DU BEL AIR\n",
"* 14-16 AVENUE a été changé en : 14-16 AVENUE\n",
"* JOSEPH PAXTON a été changé en : JOSEPH PAXTON\n",
"* FERRIERES EN BRIE 77614 MARNE LA VALLEE CEDEX 3 a été changé en : FERRIERES EN BRIE 77614 MARNE LA VALLEE CEDEX 3\n",
"</report>\n"
]
}
],
"source": [
"print(report[0])"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 'Coup de pâtes\\nTRADITION & INNOVATION\\n\\n'),\n",
" (0,\n",
" '50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\\n50 Thin crêpes sweetened with rum Negrita® (folded in four) D270 55g\\n'),\n",
" (0, '25514'),\n",
" (0, 'Rhum NEGRITA\\n50 Crêpes fines sucrées au rhum cuites, surgelées -'),\n",
" (0, '50 Crêpes sweetened with rum, baked, frozen'),\n",
" (0, '\\nIngrédients : LAIT entier, farine de BLE, sucre de canne 16.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0,\n",
" '%, ŒUFS entiers*, beurre concentré (LAIT), eau, rhum Negrita (colorant: E150a) 3.'),\n",
" (-1, '7'),\n",
" (1, '6'),\n",
" (0,\n",
" '%, sel, poudres à lever: E500-E331-amidon de BLE.\\n* Œufs issus de poules élevées au sol\\n'),\n",
" (0, 'Ingredients : Whole MILK, WHEAT flour, cane sugar 16.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0,\n",
" '%, whole EGGS*, concentrated butter (MILK), water, Negrita rum (colouring: E150a) 3.'),\n",
" (-1, '7'),\n",
" (1, '6'),\n",
" (0, '%, salt, raising agents: E500-E331-WHEAT starch.\\n* Barn eggs\\n'),\n",
" (0,\n",
" \"Conseil d'utilisation : Décongeler le produit 1 heure entre 0° et 4°C. Après décongélation et maintien à 4°C, le produit se conserve au maximum pendant 24 heures\"),\n",
" (1, '. '),\n",
" (0,\n",
" 'Suggestion: possibilité de décongeler les crêpes 30 secondes au four à micro-ondes.\\n'),\n",
" (-1, 'BPA le 24.09.2020 '),\n",
" (0, 'How to prepare the product'),\n",
" (-1, 's'),\n",
" (0, ': Defrost the product 1 hour at 0°C'),\n",
" (0, '-'),\n",
" (1, ' +'),\n",
" (0, '4°C. After thawing, preserve the product at '),\n",
" (1, '+'),\n",
" (0,\n",
" '4°C for 24 hours maximum. Suggestion: Defrost the crêpe 30 sec in the microwave.\\n'),\n",
" (0,\n",
" 'Informations nutritionnelles pour 100g / Average nutritional values for 100g:\\nValeur énergétique/Energy: 149'),\n",
" (-1, '7'),\n",
" (1, '5'),\n",
" (0, ' kJ / 356 kcal\\nMatières grasses totales/Fat (g): 11.'),\n",
" (-1, '6'),\n",
" (1, '4'),\n",
" (0, '\\n- dont Acides Gras Saturés/of which saturated fatty acids (g): '),\n",
" (-1, '6'),\n",
" (1, '5'),\n",
" (0, '.'),\n",
" (-1, '1'),\n",
" (1, '9'),\n",
" (0, '\\nGlucides/Carbohydrates (g): 4'),\n",
" (-1, '8'),\n",
" (1, '9'),\n",
" (0, '.'),\n",
" (-1, '9'),\n",
" (1, '5'),\n",
" (0, '\\n- dont sucres/of which sugar (g): 2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '.'),\n",
" (-1, '1'),\n",
" (1, '2'),\n",
" (0, '\\nProtéines/Proteins (g): 8.0\\nSel/Salt (g): 0.4'),\n",
" (-1, '8'),\n",
" (1, '5\\n'),\n",
" (0,\n",
" \"\\nA conserver à -18°C : Ne jamais recongeler un produit décongelé\\nStore at -18°C: Don't refreeze, once defrosted\\n\"),\n",
" (-1, 'Fabriq'),\n",
" (1, '\\nCo'),\n",
" (0, 'u'),\n",
" (-1, 'é'),\n",
" (1, 'p'),\n",
" (-1, 'en France - Ma'),\n",
" (0, 'de '),\n",
" (-1, 'in Franc'),\n",
" (1, 'pât'),\n",
" (0, 'e'),\n",
" (1, 's'),\n",
" (0,\n",
" '\\n50 CREPES FINES SUCREES AU RHUM NEGRITA® (PLIEES EN QUATRE) D270 55g\\n50 Crêpes fines sucrées au rhum cuites, surgelées -'),\n",
" (0, '50 Crêpes sweetened with rum, baked, frozen\\n'),\n",
" (0, 'N° DE LOT / BATCH : '),\n",
" (-1, '08'),\n",
" (1, '1162'),\n",
" (0, '4'),\n",
" (-1, '20'),\n",
" (0, '1 1'),\n",
" (-1, '5'),\n",
" (1, '3'),\n",
" (0, ':'),\n",
" (-1, '4'),\n",
" (1, '1'),\n",
" (0, '7\\nA consommer de préférence avant le / Best before : 2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '/'),\n",
" (1, '1'),\n",
" (0, '0'),\n",
" (-1, '9'),\n",
" (0, '/202'),\n",
" (-1, '1'),\n",
" (1, '5\\n'),\n",
" (0, '\\n25514\\n'),\n",
" (1, 'FAB : A04A\\n\\n'),\n",
" (0, '(01)03604380255141(15)2'),\n",
" (1, '5'),\n",
" (0, '10'),\n",
" (-1, '9'),\n",
" (0, '2'),\n",
" (-1, '4'),\n",
" (1, '5'),\n",
" (0, '(10)'),\n",
" (-1, '08'),\n",
" (1, '1162'),\n",
" (0, '4'),\n",
" (-1, '20'),\n",
" (0, '1'),\n",
" (0, '(91)0316'),\n",
" (-1, '4'),\n",
" (1, '1'),\n",
" (0, '7'),\n",
" (-1, '6'),\n",
" (1, '5'),\n",
" (0, '\\nEAN N'),\n",
" (-1, '°'),\n",
" (1, 'o'),\n",
" (0, ': 03604380255141'),\n",
" (-1, ' FAB : 00001 '),\n",
" (0, 'Poids net / Net weight : 2750 g\\n'),\n",
" (1, '\\nC.I: 7142 '),\n",
" (0, 'COUP DE PATES'),\n",
" (-1, '®'),\n",
" (0, ' S.A.S'),\n",
" (-1, ' -'),\n",
" (1, '.'),\n",
" (0, ' ZAC DU BEL AIR - 14-16 AVENUE JOSEPH PAXTON - FERRIERES EN BRIE - 77'),\n",
" (-1, '6'),\n",
" (0, '1'),\n",
" (1, '6'),\n",
" (0, '4 MARNE LA VALLEE CEDEX 3'),\n",
" (1, ' - FRANCE')]"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cleaned_diff"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,53 @@
[project]
name = "quivr-diff-assistant"
version = "0.1.0"
description = "Diff Assistant"
authors = [
{ name = "Stan Girard", email = "stan@quivr.app" }
]
dependencies = [
"python-doctr>=0.9.0",
"matplotlib>=3.9.2",
"mplcursors>=0.5.3",
"diff-match-patch>=20230430",
"scikit-learn>=1.5.1",
"numpy>=1.16.0",
"unstructured>=0.15.9",
"python-magic>=0.4.27",
"pypdfium2>=4.30.0",
"numba>=0.60.0",
"docx2txt>=0.8",
"openpyxl>=3.1.5",
"faiss-cpu>=1.8.0.post1",
"llama-index>=0.11.8",
"openai>=1.44.1",
"pandas>=2.2.2",
"pypdf>=4.3.1",
"llama-index-readers-file>=0.2.1",
"llama-index-llms-openai>=0.2.3",
"python-dotenv>=1.0.1",
"langchain>=0.2.16",
"langchain-openai>=0.1.24",
"opencv-python>=4.10.0.84",
"megaparse>=0.0.31",
"h5py==3.10.0",
]
readme = "README.md"
requires-python = ">= 3.8"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.rye]
managed = true
dev-dependencies = [
"pytest>=8.3.2",
]
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
packages = ["quivr_diff_assistant"]

View File

@ -0,0 +1,221 @@
import asyncio
from enum import Enum
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.schema import Document
from llama_index.llms.openai import OpenAI
from utils.utils import COMPARISON_PROMPT
from quivr_diff_assistant.use_case_3.parser import DeadlyParser
load_dotenv()
# Set pandas display options
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
def load_and_process_document(file_path, pickle_file):
print(file_path)
reader = SimpleDirectoryReader(input_files=[file_path])
docs = reader.load_data()
print(len(docs), " and", len(docs[0].text))
if len(docs) == 1 and len(docs[0].text) < 9:
print("No text found with classical parse, switching to OCR ...")
parser = DeadlyParser()
doc = parser.deep_parse(file_path)
docs = [Document.from_langchain_format(doc)]
node_parser = UnstructuredElementNodeParser()
raw_nodes = node_parser.get_nodes_from_documents(docs)
base_nodes, node_mappings = node_parser.get_base_nodes_and_mappings(raw_nodes)
return base_nodes, node_mappings
def create_query_engine(base_nodes, node_mappings):
vector_index = VectorStoreIndex(base_nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=5)
recursive_retriever = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever},
node_dict=node_mappings,
verbose=True,
)
return RetrieverQueryEngine.from_args(
recursive_retriever, llm=OpenAI(temperature=0, model="gpt-4")
)
def compare_responses(response1, response2):
llm = OpenAI(temperature=0, model="gpt-4")
prompt = f"""
Compare the following two responses and determine if they convey the same information:
Response for document 1: {response1}
Response for document 2: {response2}
Are these responses essentially the same? Provide a brief explanation for your conclusion. Differences in format are not important; focus on the content and the numbers.
If there are any specific differences, please highlight them with bullet points. Respond in French and in Markdown format.
"""
return llm.complete(prompt)
class ComparisonTypes(str, Enum):
CDC_ETIQUETTE = "Cahier des Charges - Etiquette"
CDC_FICHE_DEV = "Cahier des Charges - Fiche Dev"
def llm_comparator(
document: str, cdc: str, llm: BaseChatModel, comparison_type: ComparisonTypes
):
chain = COMPARISON_PROMPT | llm | StrOutputParser()
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
text_1 = "Etiquette"
elif comparison_type == ComparisonTypes.CDC_FICHE_DEV:
text_1 = "Fiche Dev"
return chain.stream(
{
"document": document,
"text_1": text_1,
"cdc": cdc,
"text_2": "Cahier des Charges",
}
)
async def test_main():
cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Mendiant Lait_QD PC F03 - FR Cahier des charges produit -rev 2021-v2.pdf"
doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Proposition étiquette Mendiant Lait croustillant.pdf"
cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-1-3_12_CDC_70690_Entremets rond vanille pécan individuel_2024.06.28 VALIDE.docx"
doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-1-3_CDP_R&D_TABL_01_Fiche développement produit - Entremets vanille pécan 28 06 2024.xlsx"
comparison_type = ComparisonTypes.CDC_FICHE_DEV
llm = ChatOpenAI(
model="gpt-4o",
temperature=0.1,
max_tokens=None,
max_retries=2,
)
parser = DeadlyParser()
parsed_cdc_doc = await parser.aparse(cdc_doc)
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
parsed_doc = await parser.deep_aparse(doc, llm=llm)
else:
parsed_doc = await parser.aparse(doc)
print("\n\n Cahier des Charges")
print(parsed_cdc_doc.page_content)
print("\n\n Other document")
print(parsed_doc.page_content)
comparison = llm_comparator(
document=parsed_doc.page_content,
cdc=parsed_cdc_doc.page_content,
llm=llm,
comparison_type=comparison_type,
)
print("\n\n Comparison")
print("".join(comparison))  # llm_comparator returns a stream of chunks
def get_document_path(doc):
try:
with open(doc.name, "wb") as temp_file:
temp_file.write(doc.getbuffer())
path = temp_file.name
except Exception:
path = doc
return path
async def parse_documents(cdc_doc, doc, comparison_type: ComparisonTypes, llm):
parser = DeadlyParser()
# Schedule the coroutines as tasks
cdc_task = asyncio.create_task(parser.aparse(get_document_path(cdc_doc)))
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
doc_task = asyncio.create_task(
parser.deep_aparse(get_document_path(doc), llm=llm)
)
else:
doc_task = asyncio.create_task(parser.aparse(get_document_path(doc)))
# Optionally, do other work here while tasks are running
# Await the tasks to get the results
parsed_cdc_doc = await cdc_task
print("\n\n Cahier de Charges: \n", parsed_cdc_doc.page_content)
parsed_doc = await doc_task
print("\n\n Other doc: \n", parsed_doc.page_content)
return parsed_cdc_doc, parsed_doc
def main():
st.title("Document Comparison Tool : Use Case 2")
# File uploaders for two documents
cdc_doc = st.file_uploader(
"Upload Cahier des Charges", type=["docx", "xlsx", "pdf", "txt"]
)
doc = st.file_uploader(
"Upload Etiquette / Fiche Dev", type=["docx", "xlsx", "pdf", "txt"]
)
comparison_type = st.selectbox(
"Select document types",
[ComparisonTypes.CDC_ETIQUETTE.value, ComparisonTypes.CDC_FICHE_DEV.value],
)
if st.button("Process Documents and Questions"):
if not cdc_doc or not doc:
st.error("Please upload both documents before launching the processing.")
return
with st.spinner("Processing files..."):
llm = ChatOpenAI(
model="gpt-4o",
temperature=0.1,
max_tokens=None,
max_retries=2,
)
parsed_cdc_doc, parsed_doc = asyncio.run(
parse_documents(cdc_doc, doc, comparison_type=comparison_type, llm=llm)
)
comparison = llm_comparator(
document=parsed_doc.page_content,
cdc=parsed_cdc_doc.page_content,
llm=llm,
comparison_type=comparison_type,
)
# Run the async function using asyncio.run()
# comparison = asyncio.run(process_documents(cdc_doc, doc, comparison_type))
st.write_stream(comparison)
if __name__ == "__main__":
main()
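# Usage sketch (the file location is an assumption): launch the Streamlit app with
#     streamlit run quivr_diff_assistant/use_case_2/main.py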

View File

@ -0,0 +1,125 @@
import asyncio
import os
import tempfile
from enum import Enum
from pathlib import Path
import streamlit as st
from diff_match_patch import diff_match_patch
# get environment variables
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai import ChatOpenAI
from use_case_3.diff_type import DiffResult, llm_comparator
from use_case_3.llm_reporter import redact_report
from use_case_3.parser import DeadlyParser
load_dotenv()
class DocumentType(Enum):
ETIQUETTE = "etiquette"
CAHIER_DES_CHARGES = "cdc"
async def create_modification_report(
before_file: str | Path,
after_file: str | Path,
type: DocumentType,
llm: BaseChatModel,
partition: bool = False,
use_llm_comparator: bool = False,
parser=DeadlyParser(),
) -> str:
if type == DocumentType.ETIQUETTE:
print("parsing before file")
before_text = parser.deep_parse(before_file, partition=partition, llm=llm)
print("parsing after file")
after_text = parser.deep_parse(after_file, partition=partition, llm=llm)
elif type == DocumentType.CAHIER_DES_CHARGES:
before_text = await parser.aparse(before_file)
after_text = await parser.aparse(after_file)
print(before_text.page_content)
print(after_text.page_content)
text_before_sections = before_text.page_content.split("\n# ")
text_after_sections = after_text.page_content.split("\n# ")
assert len(text_after_sections) == len(text_before_sections)
if use_llm_comparator:
print("using llm comparator")
return llm_comparator(
before_text.page_content, after_text.page_content, llm=llm
)
print("using diff match patch")
dmp = diff_match_patch()
section_diffs = []
for before_section, after_section in zip(
text_before_sections, text_after_sections, strict=False
):
main_diff: list[tuple[int, str]] = dmp.diff_main(before_section, after_section)
section_diffs.append(DiffResult(main_diff))
return redact_report(section_diffs, llm=llm)
def save_uploaded_file(uploaded_file):
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(uploaded_file.name)[1]
) as tmp_file:
tmp_file.write(uploaded_file.getvalue())
return tmp_file.name
st.title("Document Modification Report Generator : Use Case 3")
# File uploaders
before_file = st.file_uploader("Upload 'Before' file", type=["pdf", "docx"])
after_file = st.file_uploader("Upload 'After' file", type=["pdf", "docx"])
# Document type selector
doc_type = st.selectbox("Select document type", ["ETIQUETTE", "CAHIER_DES_CHARGES"])
# Complexity of document
complexity = st.checkbox("Complex document (lots of text to OCRise)")
# Process button
if st.button("Process"):
if before_file and after_file:
with st.spinner("Processing files..."):
# Save uploaded files
before_path = save_uploaded_file(before_file)
after_path = save_uploaded_file(after_file)
# Initialize LLM
openai_gpt4o = ChatOpenAI(
model="gpt-4o",
temperature=0,
max_tokens=None,
max_retries=2,
)
use_llm_comparator = doc_type == "ETIQUETTE"
# Generate report
print("generating report")
report = asyncio.run(
create_modification_report(
before_path,
after_path,
DocumentType[doc_type],
openai_gpt4o,
partition=complexity,
use_llm_comparator=use_llm_comparator,
)
)
print("report generated")
# Display results
st.subheader("Modification Report")
st.write(report)
# Clean up temporary files
os.unlink(before_path)
os.unlink(after_path)
else:
st.error("Please upload both 'Before' and 'After' files.")

View File

@ -0,0 +1,59 @@
# from langchain_openai import OpenAIEmbeddings
# from rich.console import Console
# from rich.panel import Panel
# from rich.prompt import Prompt
# from quivr_core import Brain
# from quivr_core.config import LLMEndpointConfig
# from quivr_core.llm.llm_endpoint import LLMEndpoint
# from quivr_core.quivr_rag import QuivrQARAG
# if __name__ == "__main__":
# brain_1 = Brain.from_files(
# name="cdc_brain",
# file_paths=["data/cdc/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.docx"],
# llm=LLMEndpoint.from_config(
# LLMEndpointConfig(model="gpt-4o-mini", temperature=0.0)
# ),
# embedder=OpenAIEmbeddings(),
# )
# brain_2 = Brain.from_files(
# name="etiquette_brain",
# file_paths=[
# "data/fiche_dev_produit/Cas2-1-3_Entremets_rond_vanille_pecan_individuel.xlsx"
# ],
# llm=LLMEndpoint.from_config(
# LLMEndpointConfig(model="gpt-4o-mini", temperature=0.0)
# ),
# embedder=OpenAIEmbeddings(),
# )
# # Check brain info
# brain_1.print_info()
# brain_2.print_info()
# console = Console()
# console.print(Panel.fit("Ask what to compare : ", style="bold magenta"))
# while True:
# # Get user input
# section = Prompt.ask("[bold cyan]Section[/bold cyan]")
# # Check if user wants to exit
# if section.lower() == "exit":
# console.print(Panel("Goodbye!", style="bold yellow"))
# break
# question = (
# f"Quelle est/sont le(s) {section} ? Answer only with exact text citation."
# )
# response_1 = brain_1.ask(question)
# response_2 = brain_2.ask(question, rag_pipeline=QuivrQARAG)
# # Print the answer with typing effect
# console.print(f"[bold green]Quivr CDC[/bold green]: {response_1.answer}")
# console.print()
# console.print(f"[bold blue]Quivr Fiche Dev[/bold blue]: {response_2.answer}")
# console.print("-" * console.width)

View File

@ -0,0 +1,105 @@
from typing import List, Tuple
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts.prompt import PromptTemplate
DIFF_PROMPT = PromptTemplate.from_template(
template="""
You need to compare two texts and report all the differences. Your job is to parse these differences and create a clear, concise report. \
Organize the report by sections and provide a detailed explanation of each difference. \
Be specific about each difference; it will be reviewed and verified by a Quality engineer.
Here are the different sections of the report:
* Dénominations, comprenant:
* dénomination légale: nom du produit tel quil est défini par la réglementation, \
en général cela inclut aussi des informations sur son état (cuit, cru, congelé...)
* dénomination commerciale: nom du produit tel qu’il est vendu au consommateur
* Ingrédients et allergènes, comprenant:
* liste dingrédients
* traces dallergènes
* Une sous-section pour chaque sous-produit s’il y a lieu;
* Eléments de traçabilité, comprenant:
* le code-barre EAN
* le code article
* DDM - date de durabilité minimale
* numéro de lot
* date de fabrication
* adresse de l'entreprise
* Conseils dutilisation / de manipulation produit, comprenant :
* Conditions de remise en oeuvre
* Durée de vie
* Conditions de transport
* Conditions de conservation : « A conserver à -18°C / Ne pas recongeler un produit décongelé »
* Temps de décongélation
* Température de préchauffage
* Poids du produit
* Valeurs / informations nutritionnelles
* Autres
Notes:
-> Coup de Pates: Tradition & Innovation, est l'entreprise productrice / marque du produit.
Chaque section doit être organisée comme suit et séparée par des lignes entre chaque avant et après :
## section_name
**Avant** : ...
**Après** : ...
**Modifications**:
* ...
* ...
-----TEXT BEFORE MODIFICATION-----
{before_text}
-----TEXT AFTER MODIFICATION-----
{after_text}
The report should be written in a professional and formal tone and in French.
"""
)
class DiffResult:
def __init__(self, diffs: List[Tuple[int, str]]) -> None:
self.diffs = diffs
def remove_dummy_diffs(self) -> None:
cleaned_diff = []
for cat, content in self.diffs:
if content.strip():
cleaned_diff.append((cat, content))
self.diffs = cleaned_diff
def format_diffs(self) -> str:
text_modified = ""
sub_stack = 0
for op, data in self.diffs:
if op == 0:
text_modified += data if sub_stack == 0 else f"_]] {data}"
elif op == -1:
if sub_stack == 0:
text_modified += f"[[{data}->"
sub_stack += 1
else:
text_modified += f"{data}->"
elif op == 1:
if sub_stack > 0:
text_modified += f"{data}]]"
sub_stack -= 1
else:
text_modified += f"[[ _ ->{data}]]"
return text_modified
def __str__(self) -> str:
return self.format_diffs()
def llm_comparator(before_text: str, after_text: str, llm: BaseChatModel) -> str:
chain = DIFF_PROMPT | llm
result = chain.invoke({"before_text": before_text, "after_text": after_text})
return str(result.content)
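# Minimal usage sketch (illustrative, not part of the original file):
# diff-match-patch produces the (op, text) tuples that DiffResult renders with
# the [[before->after]] markers expected by the report prompts:
#     dmp = diff_match_patch()
#     demo = DiffResult(dmp.diff_main("sucre de canne 16.6%", "sucre de canne 16.4%"))
#     demo.remove_dummy_diffs()
#     str(demo)  # -> 'sucre de canne 16.[[6->4]]%'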

View File

@ -0,0 +1,74 @@
from typing import List
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts.prompt import PromptTemplate
from quivr_diff_assistant.use_case_3.diff_type import DiffResult
REPORT_PROMPT = PromptTemplate.from_template(
template="""You are tasked with analyzing and reporting differences in text for a Quality engineer. The input text contains differences marked with special tokens. Your job is to parse these differences and create a clear, concise report.
Here is the text containing the differences:
<diff_text>
{text_modified}
</diff_text>
RULE #1 : If there are no [[->]] tokens, it indicates no changes to report, inventing changes means death.
The differences are marked using the following format:
- [[before->after]] indicates a change from "before" to "after"
- If there is no "before" text, it indicates an addition
- If there is no "after" text, it indicates a deletion
- If there is no [[ ]] token, it indicates no changes to report
- Make sense of the difference and do not keep the '[' in the report.
- "_" alone means empty.
Follow these steps to create your report:
1. Carefully read through the entire text.
2. Identify each instance of [[ ]] tokens.
3. For each instance, determine the modification that was made.
Present your report in the following markdown format:
# Title (Difference Report)
## Section Name
### Subsection Name (if applicable)
* Original: Original text
* Modified: Modified text
* Changes:
* Change 1
* Change 2
* Change 3
Avoid repetitive info; only report the changes.
Keep the checkboxes when possible and compare the corresponding checkboxes.
Every modification should be clearly stated with the original text and the modified text.
Note that there might be no modifications in some sections. In that case, simply return nothing.
Try to make the report as clear and concise as possible: one point for each modification found, with details; avoid long comparisons.
Remember, your goal is to create a clear and concise report that allows the Quality engineer to quickly verify the differences. Focus on accuracy and readability in your output, give every indication possible to make it easier to find the modification.
The report should be written in a professional and formal tone and in French.""",
)
def redact_report(difference_per_section: List[DiffResult], llm: BaseChatModel) -> str:
report_per_section = []
combined_diffs = ""
for section in difference_per_section:
if len(section.diffs) == 1 and section.diffs[0][0] == 0:
print("No differences found in this section.")
continue
combined_diffs += str(section)
chain = REPORT_PROMPT | llm
result = chain.invoke({"text_modified": str(combined_diffs)})
report_per_section.append(result.content)
report_text = ""
for rep in report_per_section:
report_text += "\n".join(rep.split("\n")[1:-1]) + "\n\n"
return report_text
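# Illustrative wiring (an assumption, mirroring main.py in this package): build
# one DiffResult per section with diff-match-patch, then let the LLM redact it:
#     diffs = [DiffResult(dmp.diff_main(before_section, after_section))]
#     report = redact_report(diffs, llm=ChatOpenAI(model="gpt-4o", temperature=0))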

View File

@ -0,0 +1,287 @@
"""
All of this needs to be in MegaParse, this is just a placeholder for now.
"""
import base64
from typing import List
import cv2
import numpy as np
from doctr.io import DocumentFile
from doctr.io.elements import Document as doctrDocument
from doctr.models import ocr_predictor
from doctr.models.predictor.pytorch import OCRPredictor
from doctr.utils.common_types import AbstractFile
from langchain_core.documents import Document
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from megaparse import MegaParse # FIXME: @chloedia Version problems
from quivr_api.logger import get_logger
logger = get_logger(__name__)
"""
This needs to be in megaparse @chloedia
"""
class DeadlyParser:
def __init__(self):
self.predictor: OCRPredictor = ocr_predictor(
pretrained=True, det_arch="fast_base", reco_arch="crnn_vgg16_bn"
)
async def deep_aparse(
self,
file: AbstractFile,
partition: bool = False,
llm: BaseChatModel | None = None,
) -> Document:
"""
Parse the OCR output from the input file and return the extracted text.
"""
try:
docs = DocumentFile.from_pdf(file, scale=int(500 / 72))
if partition:
cropped_image = crop_to_content(docs[0])
# cv2.imshow("cropped", cropped_image)
# cv2.waitKey(0) # Wait for a key press
docs = split_image(cropped_image)
# for i, sub_image in enumerate(docs):
# cv2.imshow(f"sub_image_{i}", sub_image)
# cv2.waitKey(0) # Wait for a key press
# cv2.destroyAllWindows()
print("ocr start")
raw_results: doctrDocument = self.predictor(docs)
print("ocr done")
if llm:
entire_content = ""
print("ocr llm start")
for raw_result, img in zip(raw_results.pages, docs, strict=False):
if raw_result.render() == "":
continue
_, buffer = cv2.imencode(".png", img)
img_str64 = base64.b64encode(buffer.tobytes()).decode("utf-8")
processed_result = llm.invoke(
[
HumanMessage(
content=[
{
"type": "text",
"text": f"Can you correct this entire text retranscription, respond only with the corrected transcription: {raw_result.render()},\n\n do not transcribe logos or images.",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_str64}",
"detail": "auto",
},
},
]
)
]
)
assert isinstance(
processed_result.content, str
), "The LVM did not return a string"
entire_content += processed_result.content
print("ocr llm done")
return Document(page_content=entire_content)
return Document(page_content=raw_results.render())
except Exception as e:
print(e)
raise
def deep_parse(
self,
file: AbstractFile,
partition: bool = False,
llm: BaseChatModel | None = None,
) -> Document:
"""
Parse the OCR output from the input file and return the extracted text.
"""
try:
logger.info("Starting document processing")
# Reduce image scale to lower memory usage
docs = DocumentFile.from_pdf(file, scale=int(300 / 72))
logger.info("Document loaded")
if partition:
logger.info("Partitioning document")
cropped_image = crop_to_content(docs[0])
docs = split_image(cropped_image)
logger.info("Starting OCR")
raw_results: doctrDocument = self.predictor(docs)
logger.info("OCR completed")
if llm:
entire_content = ""
logger.info("Starting LLM processing")
for i, (raw_result, img) in enumerate(
zip(raw_results.pages, docs, strict=False)
):
if raw_result.render() == "":
continue
_, buffer = cv2.imencode(".png", img)
img_str64 = base64.b64encode(buffer.tobytes()).decode("utf-8")
processed_result = llm.invoke(
[
HumanMessage(
content=[
{
"type": "text",
"text": f"Can you correct this entire text retranscription, respond only with the corrected transcription: {raw_result.render()},\n\n do not transcribe logos or images.",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_str64}",
"detail": "auto",
},
},
]
)
]
)
assert isinstance(
processed_result.content, str
), "The LLM did not return a string"
entire_content += processed_result.content
logger.info("LLM processing completed")
return Document(page_content=entire_content)
return Document(page_content=raw_results.render())
except Exception as e:
logger.error(f"Error in deep_parse: {str(e)}", exc_info=True)
raise
def parse(self, file_path) -> Document:
"""
Parse with megaparse
"""
mp = MegaParse(file_path)
return mp.load()
async def aparse(self, file_path) -> Document:
"""
Parse with megaparse
"""
mp = MegaParse(file_path)
return await mp.aload()
# except:
# reader = SimpleDirectoryReader(input_files=[file_path])
# docs = reader.load_data()
# for doc in docs:
# print(doc)
# pause
# return "".join([doc.text for doc in docs])
# FIXME: When time @chloedia optimize this function and discount random points on the scan
def crop_to_content(image: np.ndarray) -> np.ndarray:
"""Crop the image to the text area."""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
# Apply threshold to get image with only black and white
_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Create rectangular kernel for dilation
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
# Dilate to connect text into blocks
dilated = cv2.dilate(thresh, kernel, iterations=5)
# Find contours
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
if contours:
# Find the bounding rectangles of all contours
bounding_rects = [cv2.boundingRect(c) for c in contours]
# Combine all bounding rectangles
x = min(rect[0] for rect in bounding_rects)
y = min(rect[1] for rect in bounding_rects)
max_x = max(rect[0] + rect[2] for rect in bounding_rects)
max_y = max(rect[1] + rect[3] for rect in bounding_rects)
w = max_x - x
h = max_y - y
# Add padding
padding = 10
x = max(0, x - padding)
y = max(0, y - padding)
w = min(image.shape[1] - x, w + 2 * padding)
h = min(image.shape[0] - y, h + 2 * padding)
# Crop the image
return image[y : y + h, x : x + w]
else:
return image
# FIXME: When time @chloedia optimize this function
def split_image(image: np.ndarray) -> List[np.ndarray]:
"""Split the image into 4 parts along the y-axis, avoiding splitting letters."""
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Apply threshold
_, thresh = cv2.threshold(
gray, 250, 255, cv2.THRESH_BINARY
) # Adjust threshold for white pixels
# Find horizontal projection
h_proj = np.sum(thresh, axis=1)
# Calculate the ideal height for each part
total_height = image.shape[0]
ideal_height = total_height // 4
sub_images = []
start = 0
for i in range(3): # We'll make 3 cuts to create 4 parts
target_end = (i + 1) * ideal_height
# Look for the best cut point around the target end
best_cut = target_end
max_whitespace = 0
search_start = max(target_end - ideal_height // 2, 0)
search_end = min(target_end + ideal_height // 2, total_height)
for j in range(search_start, search_end):
# Check for a continuous white line
if np.all(thresh[j, :] == 255):
whitespace = np.sum(
h_proj[max(0, j - 5) : min(total_height, j + 6)]
== 255 * image.shape[1]
)
if whitespace > max_whitespace:
max_whitespace = whitespace
best_cut = j
# If no suitable white line is found, use the target end
if max_whitespace == 0:
best_cut = target_end
# Make the cut
sub_images.append(image[start:best_cut, :])
start = best_cut
# Add the last part
sub_images.append(image[start:, :])
return sub_images
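# Minimal usage sketch (file path is an assumption): DeadlyParser OCRs a PDF with
# doctr, optionally partitioning large labels into 4 horizontal strips before OCR
# and post-correcting the transcription with a vision-capable LLM:
#     parser = DeadlyParser()
#     doc = parser.deep_parse("etiquette_avant.pdf", partition=True, llm=llm)
#     print(doc.page_content)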

View File

@ -0,0 +1,91 @@
from langchain_core.prompts.prompt import PromptTemplate
COMPARISON_PROMPT = PromptTemplate.from_template(
template="""
You are provided with two texts <text 1> and <text 2>. You need to consider the information contained in \
<text 1> and compare it with the corresponding information contained in <text 2>. \
Keep in mind that <text 2> contains non-relevant information for this task, and that in <text 2> you \
should only focus on the information corresponding to the information contained in <text 1>. \
You need to report all the differences between the information contained in <text 1> and <text 2>. \
Your job is to parse these differences and create a clear, concise report. \
Organize the report by sections and provide a detailed explanation of each difference. \
Be specific about each difference; it will be reviewed and verified by a highly-trained quality engineer.
Here are the different sections of the report:
* Dénominations, comprenant:
* dénomination légale: nom du produit tel quil est défini par la réglementation, \
en général cela inclut aussi des informations sur son état (cuit, cru, congelé...)
* dénomination commerciale: nom du produit tel qu’il est vendu au consommateur
* Ingrédients et allergènes (si présents dans plusieurs langues, comparer langue par langue), comprenant:
* liste dingrédients
* traces dallergènes
* une sous-section pour chaque sous-produit s’il y a lieu;
* Eléments de traçabilité, comprenant:
* le code-barre EAN
* le code article
* numéro de lot
* date de fabrication
* adresse de l'entreprise
* Conseils dutilisation / de manipulation produit, comprenant :
* Conditions / conseils de remise en oeuvre
* Durée de vie
* Durée de conservation (à compter de la date de production, à température ambiante / réfrigérée)
* DDM - date de durabilité minimale
* Conditions de transport
* Conditions de conservation : « A conserver à -18°C / Ne pas recongeler un produit décongelé »
* Temps de décongélation
* Température de préchauffage
* Caractéristiques / paramètres physiques produit (unité de négoce), comprenant:
* poids de la pièce
* dimensions de la pièce
* poids du produit / unité de négoce (typiquement, carton)
* dimensions du produit / unité de négoce (typiquement, carton)
* nombre de pièces par unité de négoce (typiquement, carton) / colis
* poids du colis / carton
* Données palettisation / données techniques sur palette (unité de transport)
* hauteur palette
* dimensions de l'unité de négoce (typiquement, carton) / colis
* nombre de colis par couche / palette
* Valeurs / informations nutritionnelles
* Autres
Notes:
-> Coup de Pates: Tradition & Innovation, est l'entreprise productrice / marque du produit.
Chaque section doit être organisée comme suit :
## Section name
**<text 1>** :
* ...
* ...
**<text 2>** : ...
* ...
* ...
**Differences**:
* ...
* ...
Beginning of <text 1>
{document}
End of <text 1>
Beginning of <text 2>
{cdc}
End of <text 2>
You need to consider all the information contained in <text 1> and compare it \
with the corresponding information contained in <text 2>.
The report should be written in a professional and formal tone and in French \
and it should follow the structure outlined above. If <text 1> doesn't contain a particular piece of information, \
then you should ignore that information for <text 2> as well and avoid reporting any differences.
In the report you should replace every occurrence of <text 1> with {text_1} and every occurrence of <text 2> with {text_2}.
## Dénominations
**{text_1}** :
*
"""
)
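# Illustrative invocation (an assumption, mirroring llm_comparator in the use
# case 2 entrypoint): the prompt is piped into a chat model with both documents:
#     chain = COMPARISON_PROMPT | llm | StrOutputParser()
#     chain.stream({"document": doc_text, "cdc": cdc_text,
#                   "text_1": "Etiquette", "text_2": "Cahier des Charges"})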

View File

@ -0,0 +1,760 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false
-e file:.
aiohappyeyeballs==2.4.0
# via aiohttp
aiohttp==3.10.5
# via langchain
# via langchain-community
# via llama-index-core
# via llama-index-legacy
aiosignal==1.3.1
# via aiohttp
altair==5.4.1
# via streamlit
annotated-types==0.7.0
# via pydantic
antlr4-python3-runtime==4.9.3
# via omegaconf
anyascii==0.3.2
# via python-doctr
anyio==4.4.0
# via httpx
# via openai
appnope==0.1.4
# via ipykernel
asttokens==2.4.1
# via stack-data
attrs==24.2.0
# via aiohttp
# via jsonschema
# via referencing
backoff==2.2.1
# via unstructured
beautifulsoup4==4.12.3
# via llama-index-readers-file
# via unstructured
blinker==1.8.2
# via streamlit
cachetools==5.5.0
# via google-auth
# via streamlit
certifi==2024.7.4
# via httpcore
# via httpx
# via requests
# via unstructured-client
cffi==1.17.1
# via cryptography
chardet==5.2.0
# via unstructured
charset-normalizer==3.3.2
# via pdfminer-six
# via requests
# via unstructured-client
click==8.1.7
# via nltk
# via python-oxmsg
# via streamlit
cobble==0.1.4
# via mammoth
coloredlogs==15.0.1
# via onnxruntime
comm==0.2.2
# via ipykernel
contourpy==1.2.1
# via matplotlib
cryptography==43.0.1
# via pdfminer-six
cycler==0.12.1
# via matplotlib
dataclasses-json==0.6.7
# via langchain-community
# via llama-index-core
# via llama-index-legacy
# via unstructured
# via unstructured-client
debugpy==1.8.5
# via ipykernel
decorator==5.1.1
# via ipython
deepdiff==7.0.1
# via unstructured-client
defusedxml==0.7.1
# via python-doctr
deprecated==1.2.14
# via llama-index-core
# via llama-index-legacy
# via pikepdf
diff-match-patch==20230430
# via diff-assistant
dirtyjson==1.0.8
# via llama-index-core
# via llama-index-legacy
distro==1.9.0
# via openai
docx2txt==0.8
# via diff-assistant
effdet==0.4.1
# via unstructured
emoji==2.12.1
# via unstructured
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
faiss-cpu==1.8.0.post1
# via diff-assistant
filelock==3.15.4
# via huggingface-hub
# via torch
# via transformers
filetype==1.2.0
# via unstructured
fire==0.6.0
# via pdf2docx
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.53.1
# via matplotlib
# via pdf2docx
frozenlist==1.4.1
# via aiohttp
# via aiosignal
fsspec==2024.6.1
# via huggingface-hub
# via llama-index-core
# via llama-index-legacy
# via torch
gitdb==4.0.11
# via gitpython
gitpython==3.1.43
# via streamlit
google-api-core==2.19.2
# via google-cloud-vision
google-auth==2.34.0
# via google-api-core
# via google-cloud-vision
google-cloud-vision==3.7.4
# via unstructured
googleapis-common-protos==1.65.0
# via google-api-core
# via grpcio-status
greenlet==3.0.3
# via sqlalchemy
grpcio==1.66.1
# via google-api-core
# via grpcio-status
grpcio-status==1.66.1
# via google-api-core
h11==0.14.0
# via httpcore
h5py==3.11.0
# via python-doctr
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via langsmith
# via llama-cloud
# via llama-index-core
# via llama-index-legacy
# via openai
# via unstructured-client
huggingface-hub==0.24.6
# via python-doctr
# via timm
# via tokenizers
# via transformers
# via unstructured-inference
humanfriendly==10.0
# via coloredlogs
idna==3.7
# via anyio
# via httpx
# via requests
# via unstructured-client
# via yarl
iniconfig==2.0.0
# via pytest
iopath==0.1.10
# via layoutparser
ipykernel==6.29.5
# via diff-assistant
ipython==8.26.0
# via ipykernel
jedi==0.19.1
# via ipython
jinja2==3.1.4
# via altair
# via pydeck
# via torch
jiter==0.5.0
# via openai
joblib==1.4.2
# via nltk
# via scikit-learn
jsonpatch==1.33
# via langchain-core
jsonpath-python==1.0.6
# via unstructured-client
jsonpointer==3.0.0
# via jsonpatch
jsonschema==4.23.0
# via altair
jsonschema-specifications==2023.12.1
# via jsonschema
jupyter-client==8.6.2
# via ipykernel
jupyter-core==5.7.2
# via ipykernel
# via jupyter-client
kiwisolver==1.4.5
# via matplotlib
langchain==0.2.16
# via diff-assistant
# via langchain-community
# via megaparse
langchain-community==0.2.16
# via megaparse
langchain-core==0.2.39
# via langchain
# via langchain-community
# via langchain-openai
# via langchain-text-splitters
# via megaparse
langchain-openai==0.1.24
# via diff-assistant
# via megaparse
langchain-text-splitters==0.2.4
# via langchain
langdetect==1.0.9
# via python-doctr
# via unstructured
langsmith==0.1.118
# via langchain
# via langchain-community
# via langchain-core
layoutparser==0.3.4
# via unstructured-inference
llama-cloud==0.0.17
# via llama-index-indices-managed-llama-cloud
llama-index==0.11.8
# via diff-assistant
# via megaparse
llama-index-agent-openai==0.3.1
# via llama-index
# via llama-index-llms-openai
# via llama-index-program-openai
llama-index-cli==0.3.1
# via llama-index
llama-index-core==0.11.8
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-embeddings-openai
# via llama-index-indices-managed-llama-cloud
# via llama-index-llms-openai
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
# via llama-index-readers-file
# via llama-index-readers-llama-parse
# via llama-parse
llama-index-embeddings-openai==0.2.4
# via llama-index
# via llama-index-cli
llama-index-indices-managed-llama-cloud==0.3.0
# via llama-index
llama-index-legacy==0.9.48.post3
# via llama-index
llama-index-llms-openai==0.2.3
# via diff-assistant
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
llama-index-multi-modal-llms-openai==0.2.0
# via llama-index
llama-index-program-openai==0.2.0
# via llama-index
# via llama-index-question-gen-openai
llama-index-question-gen-openai==0.2.0
# via llama-index
llama-index-readers-file==0.2.1
# via diff-assistant
# via llama-index
llama-index-readers-llama-parse==0.3.0
# via llama-index
llama-parse==0.5.3
# via llama-index-readers-llama-parse
# via megaparse
llvmlite==0.43.0
# via numba
lxml==5.3.0
# via pikepdf
# via python-docx
# via python-pptx
# via unstructured
mammoth==1.8.0
# via megaparse
markdown-it-py==3.0.0
# via rich
markupsafe==2.1.5
# via jinja2
marshmallow==3.22.0
# via dataclasses-json
# via unstructured-client
matplotlib==3.9.2
# via diff-assistant
# via mplcursors
# via pycocotools
# via unstructured-inference
matplotlib-inline==0.1.7
# via ipykernel
# via ipython
mdurl==0.1.2
# via markdown-it-py
megaparse==0.0.31
# via diff-assistant
mplcursors==0.5.3
# via diff-assistant
mpmath==1.3.0
# via sympy
multidict==6.0.5
# via aiohttp
# via yarl
mypy-extensions==1.0.0
# via typing-inspect
# via unstructured-client
narwhals==1.6.2
# via altair
nest-asyncio==1.6.0
# via ipykernel
# via llama-index-core
# via llama-index-legacy
# via unstructured-client
networkx==3.3
# via llama-index-core
# via llama-index-legacy
# via torch
nltk==3.9.1
# via llama-index
# via llama-index-core
# via llama-index-legacy
# via unstructured
numba==0.60.0
# via diff-assistant
numpy==1.26.4
# via contourpy
# via diff-assistant
# via faiss-cpu
# via h5py
# via langchain
# via langchain-community
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via matplotlib
# via numba
# via onnx
# via onnxruntime
# via opencv-python
# via opencv-python-headless
# via pandas
# via pdf2docx
# via pyarrow
# via pycocotools
# via pydeck
# via python-doctr
# via scikit-learn
# via scipy
# via shapely
# via streamlit
# via torchvision
# via transformers
# via unstructured
olefile==0.47
# via python-oxmsg
omegaconf==2.3.0
# via effdet
onnx==1.16.2
# via python-doctr
# via unstructured
# via unstructured-inference
onnxruntime==1.19.2
# via unstructured-inference
openai==1.44.1
# via diff-assistant
# via langchain-openai
# via llama-index-agent-openai
# via llama-index-embeddings-openai
# via llama-index-legacy
# via llama-index-llms-openai
opencv-python==4.10.0.84
# via diff-assistant
# via layoutparser
# via python-doctr
# via unstructured-inference
opencv-python-headless==4.10.0.84
# via pdf2docx
openpyxl==3.1.5
# via diff-assistant
ordered-set==4.1.0
# via deepdiff
orjson==3.10.7
# via langsmith
packaging==24.1
# via altair
# via faiss-cpu
# via huggingface-hub
# via ipykernel
# via langchain-core
# via marshmallow
# via matplotlib
# via onnxruntime
# via pikepdf
# via pytest
# via streamlit
# via transformers
# via unstructured-client
# via unstructured-pytesseract
pandas==2.2.2
# via diff-assistant
# via layoutparser
# via llama-index-legacy
# via llama-index-readers-file
# via streamlit
parso==0.8.4
# via jedi
pdf2docx==0.5.8
# via megaparse
pdf2image==1.17.0
# via layoutparser
# via unstructured
pdfminer-six==20231228
# via pdfplumber
# via unstructured
pdfplumber==0.11.4
# via layoutparser
# via megaparse
pexpect==4.9.0
# via ipython
pi-heif==0.18.0
# via unstructured
pikepdf==9.2.1
# via unstructured
pillow==10.4.0
# via layoutparser
# via llama-index-core
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via python-doctr
# via python-pptx
# via streamlit
# via torchvision
# via unstructured-pytesseract
platformdirs==4.2.2
# via jupyter-core
pluggy==1.5.0
# via pytest
portalocker==2.10.1
# via iopath
prompt-toolkit==3.0.47
# via ipython
proto-plus==1.24.0
# via google-api-core
# via google-cloud-vision
protobuf==5.27.3
# via google-api-core
# via google-cloud-vision
# via googleapis-common-protos
# via grpcio-status
# via onnx
# via onnxruntime
# via proto-plus
# via streamlit
psutil==6.0.0
# via ipykernel
# via unstructured
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pyarrow==17.0.0
# via streamlit
pyasn1==0.6.1
# via pyasn1-modules
# via rsa
pyasn1-modules==0.4.1
# via google-auth
pyclipper==1.3.0.post5
# via python-doctr
pycocotools==2.0.8
# via effdet
pycparser==2.22
# via cffi
pycryptodome==3.20.0
# via megaparse
pydantic==2.8.2
# via langchain
# via langchain-core
# via langsmith
# via llama-cloud
# via llama-index-core
# via openai
pydantic-core==2.20.1
# via pydantic
pydeck==0.9.1
# via streamlit
pygments==2.18.0
# via ipython
# via rich
pymupdf==1.24.10
# via pdf2docx
pymupdfb==1.24.10
# via pymupdf
pyparsing==3.1.2
# via matplotlib
pypdf==4.3.1
# via diff-assistant
# via llama-index-readers-file
# via unstructured
# via unstructured-client
pypdfium2==4.30.0
# via diff-assistant
# via pdfplumber
# via python-doctr
pytest==8.3.2
python-dateutil==2.9.0.post0
# via jupyter-client
# via matplotlib
# via pandas
# via unstructured-client
python-doctr==0.9.0
# via diff-assistant
python-docx==1.1.2
# via megaparse
# via pdf2docx
python-dotenv==1.0.1
# via diff-assistant
# via megaparse
python-iso639==2024.4.27
# via unstructured
python-magic==0.4.27
# via diff-assistant
# via unstructured
python-multipart==0.0.9
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
# via megaparse
pytz==2024.1
# via pandas
pyyaml==6.0.2
# via huggingface-hub
# via langchain
# via langchain-community
# via langchain-core
# via layoutparser
# via llama-index-core
# via omegaconf
# via timm
# via transformers
pyzmq==26.1.1
# via ipykernel
# via jupyter-client
rapidfuzz==3.9.6
# via python-doctr
# via unstructured
# via unstructured-inference
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
regex==2024.7.24
# via nltk
# via tiktoken
# via transformers
requests==2.32.3
# via google-api-core
# via huggingface-hub
# via langchain
# via langchain-community
# via langsmith
# via llama-index-core
# via llama-index-legacy
# via requests-toolbelt
# via streamlit
# via tiktoken
# via transformers
# via unstructured
# via unstructured-client
requests-toolbelt==1.0.0
# via unstructured-client
rich==13.8.0
# via streamlit
rpds-py==0.20.0
# via jsonschema
# via referencing
rsa==4.9
# via google-auth
safetensors==0.4.5
# via timm
# via transformers
scikit-learn==1.5.1
# via diff-assistant
scipy==1.14.1
# via layoutparser
# via python-doctr
# via scikit-learn
shapely==2.0.6
# via python-doctr
six==1.16.0
# via asttokens
# via fire
# via langdetect
# via python-dateutil
# via unstructured-client
smmap==5.0.1
# via gitdb
sniffio==1.3.1
# via anyio
# via httpx
# via openai
soupsieve==2.6
# via beautifulsoup4
sqlalchemy==2.0.32
# via langchain
# via langchain-community
# via llama-index-core
# via llama-index-legacy
stack-data==0.6.3
# via ipython
streamlit==1.38.0
# via diff-assistant
striprtf==0.0.26
# via llama-index-readers-file
sympy==1.13.2
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
# via langchain-core
# via llama-index-core
# via llama-index-legacy
# via streamlit
termcolor==2.4.0
# via fire
threadpoolctl==3.5.0
# via scikit-learn
tiktoken==0.7.0
# via langchain-openai
# via llama-index-core
# via llama-index-legacy
timm==1.0.9
# via effdet
# via unstructured-inference
tokenizers==0.19.1
# via transformers
toml==0.10.2
# via streamlit
torch==2.3.1
# via diff-assistant
# via effdet
# via python-doctr
# via timm
# via torchvision
# via unstructured-inference
torchvision==0.18.1
# via effdet
# via python-doctr
# via timm
tornado==6.4.1
# via ipykernel
# via jupyter-client
# via streamlit
tqdm==4.66.5
# via huggingface-hub
# via iopath
# via llama-index-core
# via nltk
# via openai
# via python-doctr
# via transformers
# via unstructured
traitlets==5.14.3
# via comm
# via ipykernel
# via ipython
# via jupyter-client
# via jupyter-core
# via matplotlib-inline
transformers==4.44.2
# via unstructured-inference
typing-extensions==4.12.2
# via altair
# via emoji
# via huggingface-hub
# via iopath
# via ipython
# via langchain-core
# via llama-index-core
# via llama-index-legacy
# via openai
# via pydantic
# via pydantic-core
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via streamlit
# via torch
# via typing-inspect
# via unstructured
# via unstructured-client
typing-inspect==0.9.0
# via dataclasses-json
# via llama-index-core
# via llama-index-legacy
# via unstructured-client
tzdata==2024.1
# via pandas
unstructured==0.15.9
# via diff-assistant
# via megaparse
unstructured-client==0.25.5
# via unstructured
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
urllib3==2.2.2
# via requests
# via unstructured-client
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.16.0
# via deprecated
# via llama-index-core
# via unstructured
xlsxwriter==3.2.0
# via python-pptx
yarl==1.9.7
# via aiohttp

View File

@ -0,0 +1,754 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false
-e file:.
aiohappyeyeballs==2.4.0
# via aiohttp
aiohttp==3.10.5
# via langchain
# via langchain-community
# via llama-index-core
# via llama-index-legacy
aiosignal==1.3.1
# via aiohttp
altair==5.4.1
# via streamlit
annotated-types==0.7.0
# via pydantic
antlr4-python3-runtime==4.9.3
# via omegaconf
anyascii==0.3.2
# via python-doctr
anyio==4.4.0
# via httpx
# via openai
appnope==0.1.4
# via ipykernel
asttokens==2.4.1
# via stack-data
attrs==24.2.0
# via aiohttp
# via jsonschema
# via referencing
backoff==2.2.1
# via unstructured
beautifulsoup4==4.12.3
# via llama-index-readers-file
# via unstructured
blinker==1.8.2
# via streamlit
cachetools==5.5.0
# via google-auth
# via streamlit
certifi==2024.7.4
# via httpcore
# via httpx
# via requests
# via unstructured-client
cffi==1.17.1
# via cryptography
chardet==5.2.0
# via unstructured
charset-normalizer==3.3.2
# via pdfminer-six
# via requests
# via unstructured-client
click==8.1.7
# via nltk
# via python-oxmsg
# via streamlit
cobble==0.1.4
# via mammoth
coloredlogs==15.0.1
# via onnxruntime
comm==0.2.2
# via ipykernel
contourpy==1.2.1
# via matplotlib
cryptography==43.0.1
# via pdfminer-six
cycler==0.12.1
# via matplotlib
dataclasses-json==0.6.7
# via langchain-community
# via llama-index-core
# via llama-index-legacy
# via unstructured
# via unstructured-client
debugpy==1.8.5
# via ipykernel
decorator==5.1.1
# via ipython
deepdiff==7.0.1
# via unstructured-client
defusedxml==0.7.1
# via python-doctr
deprecated==1.2.14
# via llama-index-core
# via llama-index-legacy
# via pikepdf
diff-match-patch==20230430
# via diff-assistant
dirtyjson==1.0.8
# via llama-index-core
# via llama-index-legacy
distro==1.9.0
# via openai
docx2txt==0.8
# via diff-assistant
effdet==0.4.1
# via unstructured
emoji==2.12.1
# via unstructured
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
faiss-cpu==1.8.0.post1
# via diff-assistant
filelock==3.15.4
# via huggingface-hub
# via torch
# via transformers
filetype==1.2.0
# via unstructured
fire==0.6.0
# via pdf2docx
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.53.1
# via matplotlib
# via pdf2docx
frozenlist==1.4.1
# via aiohttp
# via aiosignal
fsspec==2024.6.1
# via huggingface-hub
# via llama-index-core
# via llama-index-legacy
# via torch
gitdb==4.0.11
# via gitpython
gitpython==3.1.43
# via streamlit
google-api-core==2.19.2
# via google-cloud-vision
google-auth==2.34.0
# via google-api-core
# via google-cloud-vision
google-cloud-vision==3.7.4
# via unstructured
googleapis-common-protos==1.65.0
# via google-api-core
# via grpcio-status
greenlet==3.0.3
# via sqlalchemy
grpcio==1.66.1
# via google-api-core
# via grpcio-status
grpcio-status==1.66.1
# via google-api-core
h11==0.14.0
# via httpcore
h5py==3.11.0
# via python-doctr
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via langsmith
# via llama-cloud
# via llama-index-core
# via llama-index-legacy
# via openai
# via unstructured-client
huggingface-hub==0.24.6
# via python-doctr
# via timm
# via tokenizers
# via transformers
# via unstructured-inference
humanfriendly==10.0
# via coloredlogs
idna==3.7
# via anyio
# via httpx
# via requests
# via unstructured-client
# via yarl
iopath==0.1.10
# via layoutparser
ipykernel==6.29.5
# via diff-assistant
ipython==8.26.0
# via ipykernel
jedi==0.19.1
# via ipython
jinja2==3.1.4
# via altair
# via pydeck
# via torch
jiter==0.5.0
# via openai
joblib==1.4.2
# via nltk
# via scikit-learn
jsonpatch==1.33
# via langchain-core
jsonpath-python==1.0.6
# via unstructured-client
jsonpointer==3.0.0
# via jsonpatch
jsonschema==4.23.0
# via altair
jsonschema-specifications==2023.12.1
# via jsonschema
jupyter-client==8.6.2
# via ipykernel
jupyter-core==5.7.2
# via ipykernel
# via jupyter-client
kiwisolver==1.4.5
# via matplotlib
langchain==0.2.16
# via diff-assistant
# via langchain-community
# via megaparse
langchain-community==0.2.16
# via megaparse
langchain-core==0.2.39
# via langchain
# via langchain-community
# via langchain-openai
# via langchain-text-splitters
# via megaparse
langchain-openai==0.1.24
# via diff-assistant
# via megaparse
langchain-text-splitters==0.2.4
# via langchain
langdetect==1.0.9
# via python-doctr
# via unstructured
langsmith==0.1.118
# via langchain
# via langchain-community
# via langchain-core
layoutparser==0.3.4
# via unstructured-inference
llama-cloud==0.0.17
# via llama-index-indices-managed-llama-cloud
llama-index==0.11.8
# via diff-assistant
# via megaparse
llama-index-agent-openai==0.3.1
# via llama-index
# via llama-index-llms-openai
# via llama-index-program-openai
llama-index-cli==0.3.1
# via llama-index
llama-index-core==0.11.8
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-embeddings-openai
# via llama-index-indices-managed-llama-cloud
# via llama-index-llms-openai
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
# via llama-index-readers-file
# via llama-index-readers-llama-parse
# via llama-parse
llama-index-embeddings-openai==0.2.4
# via llama-index
# via llama-index-cli
llama-index-indices-managed-llama-cloud==0.3.0
# via llama-index
llama-index-legacy==0.9.48.post3
# via llama-index
llama-index-llms-openai==0.2.3
# via diff-assistant
# via llama-index
# via llama-index-agent-openai
# via llama-index-cli
# via llama-index-multi-modal-llms-openai
# via llama-index-program-openai
# via llama-index-question-gen-openai
llama-index-multi-modal-llms-openai==0.2.0
# via llama-index
llama-index-program-openai==0.2.0
# via llama-index
# via llama-index-question-gen-openai
llama-index-question-gen-openai==0.2.0
# via llama-index
llama-index-readers-file==0.2.1
# via diff-assistant
# via llama-index
llama-index-readers-llama-parse==0.3.0
# via llama-index
llama-parse==0.5.3
# via llama-index-readers-llama-parse
# via megaparse
llvmlite==0.43.0
# via numba
lxml==5.3.0
# via pikepdf
# via python-docx
# via python-pptx
# via unstructured
mammoth==1.8.0
# via megaparse
markdown-it-py==3.0.0
# via rich
markupsafe==2.1.5
# via jinja2
marshmallow==3.22.0
# via dataclasses-json
# via unstructured-client
matplotlib==3.9.2
# via diff-assistant
# via mplcursors
# via pycocotools
# via unstructured-inference
matplotlib-inline==0.1.7
# via ipykernel
# via ipython
mdurl==0.1.2
# via markdown-it-py
megaparse==0.0.31
# via diff-assistant
mplcursors==0.5.3
# via diff-assistant
mpmath==1.3.0
# via sympy
multidict==6.0.5
# via aiohttp
# via yarl
mypy-extensions==1.0.0
# via typing-inspect
# via unstructured-client
narwhals==1.6.2
# via altair
nest-asyncio==1.6.0
# via ipykernel
# via llama-index-core
# via llama-index-legacy
# via unstructured-client
networkx==3.3
# via llama-index-core
# via llama-index-legacy
# via torch
nltk==3.9.1
# via llama-index
# via llama-index-core
# via llama-index-legacy
# via unstructured
numba==0.60.0
# via diff-assistant
numpy==1.26.4
# via contourpy
# via diff-assistant
# via faiss-cpu
# via h5py
# via langchain
# via langchain-community
# via layoutparser
# via llama-index-core
# via llama-index-legacy
# via matplotlib
# via numba
# via onnx
# via onnxruntime
# via opencv-python
# via opencv-python-headless
# via pandas
# via pdf2docx
# via pyarrow
# via pycocotools
# via pydeck
# via python-doctr
# via scikit-learn
# via scipy
# via shapely
# via streamlit
# via torchvision
# via transformers
# via unstructured
olefile==0.47
# via python-oxmsg
omegaconf==2.3.0
# via effdet
onnx==1.16.2
# via python-doctr
# via unstructured
# via unstructured-inference
onnxruntime==1.19.2
# via unstructured-inference
openai==1.44.1
# via diff-assistant
# via langchain-openai
# via llama-index-agent-openai
# via llama-index-embeddings-openai
# via llama-index-legacy
# via llama-index-llms-openai
opencv-python==4.10.0.84
# via diff-assistant
# via layoutparser
# via python-doctr
# via unstructured-inference
opencv-python-headless==4.10.0.84
# via pdf2docx
openpyxl==3.1.5
# via diff-assistant
ordered-set==4.1.0
# via deepdiff
orjson==3.10.7
# via langsmith
packaging==24.1
# via altair
# via faiss-cpu
# via huggingface-hub
# via ipykernel
# via langchain-core
# via marshmallow
# via matplotlib
# via onnxruntime
# via pikepdf
# via streamlit
# via transformers
# via unstructured-client
# via unstructured-pytesseract
pandas==2.2.2
# via diff-assistant
# via layoutparser
# via llama-index-legacy
# via llama-index-readers-file
# via streamlit
parso==0.8.4
# via jedi
pdf2docx==0.5.8
# via megaparse
pdf2image==1.17.0
# via layoutparser
# via unstructured
pdfminer-six==20231228
# via pdfplumber
# via unstructured
pdfplumber==0.11.4
# via layoutparser
# via megaparse
pexpect==4.9.0
# via ipython
pi-heif==0.18.0
# via unstructured
pikepdf==9.2.1
# via unstructured
pillow==10.4.0
# via layoutparser
# via llama-index-core
# via matplotlib
# via pdf2image
# via pdfplumber
# via pi-heif
# via pikepdf
# via python-doctr
# via python-pptx
# via streamlit
# via torchvision
# via unstructured-pytesseract
platformdirs==4.2.2
# via jupyter-core
portalocker==2.10.1
# via iopath
prompt-toolkit==3.0.47
# via ipython
proto-plus==1.24.0
# via google-api-core
# via google-cloud-vision
protobuf==5.27.3
# via google-api-core
# via google-cloud-vision
# via googleapis-common-protos
# via grpcio-status
# via onnx
# via onnxruntime
# via proto-plus
# via streamlit
psutil==6.0.0
# via ipykernel
# via unstructured
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pyarrow==17.0.0
# via streamlit
pyasn1==0.6.1
# via pyasn1-modules
# via rsa
pyasn1-modules==0.4.1
# via google-auth
pyclipper==1.3.0.post5
# via python-doctr
pycocotools==2.0.8
# via effdet
pycparser==2.22
# via cffi
pycryptodome==3.20.0
# via megaparse
pydantic==2.8.2
# via langchain
# via langchain-core
# via langsmith
# via llama-cloud
# via llama-index-core
# via openai
pydantic-core==2.20.1
# via pydantic
pydeck==0.9.1
# via streamlit
pygments==2.18.0
# via ipython
# via rich
pymupdf==1.24.10
# via pdf2docx
pymupdfb==1.24.10
# via pymupdf
pyparsing==3.1.2
# via matplotlib
pypdf==4.3.1
# via diff-assistant
# via llama-index-readers-file
# via unstructured
# via unstructured-client
pypdfium2==4.30.0
# via diff-assistant
# via pdfplumber
# via python-doctr
python-dateutil==2.9.0.post0
# via jupyter-client
# via matplotlib
# via pandas
# via unstructured-client
python-doctr==0.9.0
# via diff-assistant
python-docx==1.1.2
# via megaparse
# via pdf2docx
python-dotenv==1.0.1
# via diff-assistant
# via megaparse
python-iso639==2024.4.27
# via unstructured
python-magic==0.4.27
# via diff-assistant
# via unstructured
python-multipart==0.0.9
# via unstructured-inference
python-oxmsg==0.0.1
# via unstructured
python-pptx==1.0.2
# via megaparse
pytz==2024.1
# via pandas
pyyaml==6.0.2
# via huggingface-hub
# via langchain
# via langchain-community
# via langchain-core
# via layoutparser
# via llama-index-core
# via omegaconf
# via timm
# via transformers
pyzmq==26.1.1
# via ipykernel
# via jupyter-client
rapidfuzz==3.9.6
# via python-doctr
# via unstructured
# via unstructured-inference
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
regex==2024.7.24
# via nltk
# via tiktoken
# via transformers
requests==2.32.3
# via google-api-core
# via huggingface-hub
# via langchain
# via langchain-community
# via langsmith
# via llama-index-core
# via llama-index-legacy
# via requests-toolbelt
# via streamlit
# via tiktoken
# via transformers
# via unstructured
# via unstructured-client
requests-toolbelt==1.0.0
# via unstructured-client
rich==13.8.0
# via streamlit
rpds-py==0.20.0
# via jsonschema
# via referencing
rsa==4.9
# via google-auth
safetensors==0.4.5
# via timm
# via transformers
scikit-learn==1.5.1
# via diff-assistant
scipy==1.14.1
# via layoutparser
# via python-doctr
# via scikit-learn
shapely==2.0.6
# via python-doctr
six==1.16.0
# via asttokens
# via fire
# via langdetect
# via python-dateutil
# via unstructured-client
smmap==5.0.1
# via gitdb
sniffio==1.3.1
# via anyio
# via httpx
# via openai
soupsieve==2.6
# via beautifulsoup4
sqlalchemy==2.0.32
# via langchain
# via langchain-community
# via llama-index-core
# via llama-index-legacy
stack-data==0.6.3
# via ipython
streamlit==1.38.0
# via diff-assistant
striprtf==0.0.26
# via llama-index-readers-file
sympy==1.13.2
# via onnxruntime
# via torch
tabulate==0.9.0
# via unstructured
tenacity==8.5.0
# via langchain
# via langchain-community
# via langchain-core
# via llama-index-core
# via llama-index-legacy
# via streamlit
termcolor==2.4.0
# via fire
threadpoolctl==3.5.0
# via scikit-learn
tiktoken==0.7.0
# via langchain-openai
# via llama-index-core
# via llama-index-legacy
timm==1.0.9
# via effdet
# via unstructured-inference
tokenizers==0.19.1
# via transformers
toml==0.10.2
# via streamlit
torch==2.3.1
# via diff-assistant
# via effdet
# via python-doctr
# via timm
# via torchvision
# via unstructured-inference
torchvision==0.18.1
# via effdet
# via python-doctr
# via timm
tornado==6.4.1
# via ipykernel
# via jupyter-client
# via streamlit
tqdm==4.66.5
# via huggingface-hub
# via iopath
# via llama-index-core
# via nltk
# via openai
# via python-doctr
# via transformers
# via unstructured
traitlets==5.14.3
# via comm
# via ipykernel
# via ipython
# via jupyter-client
# via jupyter-core
# via matplotlib-inline
transformers==4.44.2
# via unstructured-inference
typing-extensions==4.12.2
# via altair
# via emoji
# via huggingface-hub
# via iopath
# via ipython
# via langchain-core
# via llama-index-core
# via llama-index-legacy
# via openai
# via pydantic
# via pydantic-core
# via python-docx
# via python-oxmsg
# via python-pptx
# via sqlalchemy
# via streamlit
# via torch
# via typing-inspect
# via unstructured
# via unstructured-client
typing-inspect==0.9.0
# via dataclasses-json
# via llama-index-core
# via llama-index-legacy
# via unstructured-client
tzdata==2024.1
# via pandas
unstructured==0.15.9
# via diff-assistant
# via megaparse
unstructured-client==0.25.5
# via unstructured
unstructured-inference==0.7.36
# via unstructured
unstructured-pytesseract==0.3.13
# via unstructured
urllib3==2.2.2
# via requests
# via unstructured-client
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.16.0
# via deprecated
# via llama-index-core
# via unstructured
xlsxwriter==3.2.0
# via python-pptx
yarl==1.9.7
# via aiohttp

View File

@ -0,0 +1,6 @@
import pytest
@pytest.fixture
def hello_message():
return "Hello from diff-assistant!"

View File

@ -0,0 +1,5 @@
from use_case_3 import hello
def test_hello(hello_message):
assert hello() == hello_message

View File

@ -8,6 +8,7 @@ authors = [
dependencies = [
"quivr-core[all]",
"quivr-api",
"quivr-diff-assistant",
"celery[redis]>=5.0.0",
"python-dotenv>=1.0.0",
"playwright>=1.0.0",
@ -48,3 +49,7 @@ path = "../quivr-core"
[[tool.rye.sources]]
name = "quivr-api"
path = "../quivr-api"
[[tool.rye.sources]]
name = "quivr-diff-assistant"
path = "./diff-assistant"

View File

@ -5,6 +5,8 @@ from quivr_api.modules.upload.service.upload_file import (
upload_file_storage,
)
from quivr_worker.assistants.cdp_use_case_2 import process_cdp_use_case_2
from quivr_worker.assistants.cdp_use_case_3 import process_cdp_use_case_3
from quivr_worker.utils.pdf_generator.pdf_generator import PDFGenerator, PDFModel
@ -15,19 +17,29 @@ async def process_assistant(
tasks_service: TasksService,
user_id: str,
):
print(task_id)
task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore
assistant_name = task.assistant_name
output = ""
if assistant_id == 3:
output = await process_cdp_use_case_3(
assistant_id, notification_uuid, task_id, tasks_service, user_id
)
elif assistant_id == 2:
output = await process_cdp_use_case_2(
assistant_id, notification_uuid, task_id, tasks_service, user_id
)
else:
new_task = await tasks_service.update_task(task_id, {"status": "processing"})
await tasks_service.update_task(task_id, {"status": "in_progress"})
print(task)
task_result = {"status": "completed", "answer": "#### Assistant answer"}
task_result = {"status": "completed", "answer": output}
output_dir = f"{assistant_id}/{notification_uuid}"
os.makedirs(output_dir, exist_ok=True)
output_path = f"{output_dir}/output.pdf"
generated_pdf = PDFGenerator(PDFModel(title="Test", content="Test"))
generated_pdf = PDFGenerator(PDFModel(title=assistant_name, content=output))
generated_pdf.print_pdf()
generated_pdf.output(output_path)
@ -36,5 +48,4 @@ async def process_assistant(
# Now delete the file
os.remove(output_path)
await tasks_service.update_task(task_id, task_result)

View File

@ -0,0 +1,312 @@
import asyncio
import random
import string
from enum import Enum
import pandas as pd
# get environment variables
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.schema import Document
from llama_index.llms.openai import OpenAI
from quivr_api.logger import get_logger
from quivr_api.modules.assistant.dto.inputs import InputAssistant
from quivr_api.modules.assistant.services.tasks_service import TasksService
from quivr_api.modules.dependencies import get_supabase_client
from quivr_diff_assistant.use_case_3.parser import DeadlyParser
from quivr_diff_assistant.utils.utils import COMPARISON_PROMPT
logger = get_logger(__name__)
# Set pandas display options
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
def load_and_process_document(file_path, pickle_file):
print(file_path)
reader = SimpleDirectoryReader(input_files=[file_path])
docs = reader.load_data()
print(len(docs), " and", len(docs[0].text))
if len(docs) == 1 and len(docs[0].text) < 9:
print("No text found with classical parse, switching to OCR ...")
parser = DeadlyParser()
doc = parser.deep_parse(file_path)
docs = [Document.from_langchain_format(doc)]
node_parser = UnstructuredElementNodeParser()
raw_nodes = node_parser.get_nodes_from_documents(docs)
base_nodes, node_mappings = node_parser.get_base_nodes_and_mappings(raw_nodes)
return base_nodes, node_mappings
def create_query_engine(base_nodes, node_mappings):
vector_index = VectorStoreIndex(base_nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=5)
recursive_retriever = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever},
node_dict=node_mappings,
verbose=True,
)
return RetrieverQueryEngine.from_args(
recursive_retriever, llm=OpenAI(temperature=0, model="gpt-4")
)
def compare_responses(response1, response2):
llm = OpenAI(temperature=0, model="gpt-4")
prompt = f"""
Compare the following two responses and determine if they convey the same information:
Response for document 1: {response1}
Response for document 2: {response2}
Are these responses essentially the same? Provide a brief explanation for your conclusion. Differences in format are not important; focus on the content and the numbers.
If there are any specific differences, please highlight them with bullet points. Respond in French and in markdown format.
"""
return llm.complete(prompt)
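# Illustrative sketch (hypothetical, kept commented out): how the retrieval
# helpers above could be chained to compare the same question across two
# documents. The file paths and the question are placeholders, not part of
# the worker flow below.
#
# base_nodes_1, mappings_1 = load_and_process_document("doc1.pdf", None)
# base_nodes_2, mappings_2 = load_and_process_document("doc2.pdf", None)
# engine_1 = create_query_engine(base_nodes_1, mappings_1)
# engine_2 = create_query_engine(base_nodes_2, mappings_2)
# question = "Quelle est la composition du produit ?"
# print(compare_responses(engine_1.query(question), engine_2.query(question)))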
class ComparisonTypes(str, Enum):
CDC_ETIQUETTE = "Cahier des Charges - Etiquette"
CDC_FICHE_DEV = "Cahier des Charges - Fiche Dev"
def llm_comparator(
document: str, cdc: str, llm: BaseChatModel, comparison_type: ComparisonTypes
):
chain = COMPARISON_PROMPT | llm | StrOutputParser()
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
text_1 = "Etiquette"
elif comparison_type == ComparisonTypes.CDC_FICHE_DEV:
text_1 = "Fiche Dev"
return chain.stream(
{
"document": document,
"text_1": text_1,
"cdc": cdc,
"text_2": "Cahier des Charges",
}
)
async def process_cdp_use_case_2(
assistant_id: str,
notification_uuid: str,
task_id: int,
tasks_service: TasksService,
user_id: str,
) -> str:
task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore
logger.info(f"Task: {task} 📝")
# Parse settings into InputAssistant
input_assistant = InputAssistant.model_validate(task.settings)
assert input_assistant.inputs.files is not None
assert len(input_assistant.inputs.files) == 2
# Get the value of the "Document 1" key and "Document 2" key. The input files might not be in the order of "Document 1" and "Document 2"
# So we need to find the correct order
logger.info(f"Input assistant: {input_assistant} 📂")
before_file_key = input_assistant.inputs.files[0].key
after_file_key = input_assistant.inputs.files[1].key
before_file_value = input_assistant.inputs.files[0].value
after_file_value = input_assistant.inputs.files[1].value
if before_file_key == "Document 2":
before_file_value = input_assistant.inputs.files[1].value
after_file_value = input_assistant.inputs.files[0].value
# Get the files from supabase
supabase_client = get_supabase_client()
path = f"{task.assistant_id}/{task.pretty_id}/"
logger.info(f"Path: {path} 📁")
await tasks_service.update_task(task_id, {"status": "processing"})
before_file_data = supabase_client.storage.from_("quivr").download(
f"{path}{before_file_value}"
)
after_file_data = supabase_client.storage.from_("quivr").download(
f"{path}{after_file_value}"
)
# Generate a random string of 8 characters
random_string = "".join(random.choices(string.ascii_letters + string.digits, k=8))
# Write temp files with the original name without using save_uploaded_file
# because the file is already in the quivr bucket
before_file_path = f"/tmp/{random_string}_{before_file_value}"
after_file_path = f"/tmp/{random_string}_{after_file_value}"
with open(before_file_path, "wb") as f:
f.write(before_file_data)
with open(after_file_path, "wb") as f:
f.write(after_file_data)
assert input_assistant.inputs.select_texts is not None
value_use_case = input_assistant.inputs.select_texts[0].value
## Get the document type
document_type = None
if value_use_case == "Etiquettes VS Cahier des charges":
document_type = ComparisonTypes.CDC_ETIQUETTE
elif value_use_case == "Fiche Dev VS Cahier des charges":
document_type = ComparisonTypes.CDC_FICHE_DEV
else:
logger.error(f"❌ Document type not supported: {value_use_case}")
raise ValueError(f"Document type not supported: {value_use_case}")
parser = DeadlyParser()
logger.info(f"Document type: {document_type} 📄")
llm = ChatOpenAI(
model="gpt-4o",
temperature=0.1,
max_tokens=None,
max_retries=2,
)
before_file_parsed = await parser.aparse(before_file_path)
logger.info("Before file parsed 📜")
after_file_parsed = None
if document_type == ComparisonTypes.CDC_ETIQUETTE:
logger.info("Parsing after file with deep parse 🔍")
after_file_parsed = await parser.deep_aparse(after_file_path, llm=llm)
else:
logger.info("Parsing after file with classical parse 🔍")
after_file_parsed = await parser.aparse(after_file_path)
logger.info("Comparing documents ⚖️")
comparison = llm_comparator(
document=after_file_parsed.page_content,
cdc=before_file_parsed.page_content,
llm=llm,
comparison_type=document_type,
)
logger.info(f"Comparison: {comparison}")
return "".join(comparison)
async def test_main():
cdc_doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Mendiant Lait_QD PC F03 - FR Cahier des charges produit -rev 2021-v2.pdf"
doc = "/Users/jchevall/Coding/diff-assistant/data/Use case #2/Cas2-2-1_Proposition étiquette Mendiant Lait croustillant.pdf"
comparison_type = ComparisonTypes.CDC_FICHE_DEV
llm = ChatOpenAI(
model="gpt-4o",
temperature=0.1,
max_tokens=None,
max_retries=2,
)
parser = DeadlyParser()
parsed_cdc_doc = await parser.aparse(cdc_doc)
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
parsed_doc = await parser.deep_aparse(doc, llm=llm)
else:
parsed_doc = await parser.aparse(doc)
print("\n\n Cahier des Charges")
print(parsed_cdc_doc.page_content)
print("\n\n Other document")
print(parsed_doc.page_content)
comparison = llm_comparator(
document=parsed_doc.page_content,
cdc=parsed_cdc_doc.page_content,
llm=llm,
comparison_type=comparison_type,
)
print("\n\n Comparison")
print("".join(comparison))
def get_document_path(doc):
try:
with open(doc.name, "wb") as temp_file:
temp_file.write(doc.getbuffer())
path = temp_file.name
except Exception:  # doc is already a filesystem path
path = doc
return path
async def parse_documents(cdc_doc, doc, comparison_type: ComparisonTypes, llm):
parser = DeadlyParser()
# Schedule the coroutines as tasks
cdc_task = asyncio.create_task(parser.aparse(get_document_path(cdc_doc)))
if comparison_type == ComparisonTypes.CDC_ETIQUETTE:
doc_task = asyncio.create_task(
parser.deep_aparse(get_document_path(doc), llm=llm)
)
else:
doc_task = asyncio.create_task(parser.aparse(get_document_path(doc)))
# Optionally, do other work here while tasks are running
# Await the tasks to get the results
parsed_cdc_doc = await cdc_task
print("\n\n Cahier de Charges: \n", parsed_cdc_doc.page_content)
parsed_doc = await doc_task
print("\n\n Other doc: \n", parsed_doc.page_content)
return parsed_cdc_doc, parsed_doc
# def main():
# st.title("Document Comparison Tool : Use Case 2")
# # File uploaders for two documents
# cdc_doc = st.file_uploader(
# "Upload Cahier des Charges", type=["docx", "xlsx", "pdf", "txt"]
# )
# doc = st.file_uploader(
# "Upload Etiquette / Fiche Dev", type=["docx", "xlsx", "pdf", "txt"]
# )
# comparison_type = st.selectbox(
# "Select document types",
# [ComparisonTypes.CDC_ETIQUETTE.value, ComparisonTypes.CDC_FICHE_DEV.value],
# )
# if st.button("Process Documents and Questions"):
# if not cdc_doc or not doc:
# st.error("Please upload both documents before launching the processing.")
# return
# with st.spinner("Processing files..."):
# llm = ChatOpenAI(
# model="gpt-4o",
# temperature=0.1,
# max_tokens=None,
# max_retries=2,
# )
# parsed_cdc_doc, parsed_doc = asyncio.run(
# parse_documents(cdc_doc, doc, comparison_type=comparison_type, llm=llm)
# )
# comparison = llm_comparator(
# document=parsed_doc.page_content,
# cdc=parsed_cdc_doc.page_content,
# llm=llm,
# comparison_type=comparison_type,
# )
# # Run the async function using asyncio.run()
# # comparison = asyncio.run(process_documents(cdc_doc, doc, comparison_type))
# st.write_stream(comparison)

View File

@ -0,0 +1,224 @@
import os
import random
import string
import tempfile
from enum import Enum
from pathlib import Path
from diff_match_patch import diff_match_patch
# get environment variables
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_openai import ChatOpenAI
from quivr_api.logger import get_logger
from quivr_api.modules.assistant.dto.inputs import InputAssistant
from quivr_api.modules.assistant.services.tasks_service import TasksService
from quivr_api.modules.dependencies import get_supabase_client
from quivr_diff_assistant.use_case_3.diff_type import DiffResult, llm_comparator
from quivr_diff_assistant.use_case_3.llm_reporter import redact_report
from quivr_diff_assistant.use_case_3.parser import DeadlyParser
logger = get_logger(__name__)
class DocumentType(Enum):
ETIQUETTE = "etiquette"
CAHIER_DES_CHARGES = "cdc"
async def process_cdp_use_case_3(
assistant_id: str,
notification_uuid: str,
task_id: int,
tasks_service: TasksService,
user_id: str,
) -> str:
task = await tasks_service.get_task_by_id(task_id, user_id) # type: ignore
# Parse settings into InputAssistant
input_assistant = InputAssistant.model_validate(task.settings)
assert input_assistant.inputs.files is not None
assert len(input_assistant.inputs.files) == 2
# Get the value of the "Document 1" key and "Document 2" key. The input files might not be in the order of "Document 1" and "Document 2"
# So we need to find the correct order
before_file_key = input_assistant.inputs.files[0].key
after_file_key = input_assistant.inputs.files[1].key
before_file_value = input_assistant.inputs.files[0].value
after_file_value = input_assistant.inputs.files[1].value
if before_file_key == "Document 2":
before_file_value = input_assistant.inputs.files[1].value
after_file_value = input_assistant.inputs.files[0].value
# Get the files from supabase
supabase_client = get_supabase_client()
path = f"{task.assistant_id}/{task.pretty_id}/"
await tasks_service.update_task(task_id, {"status": "processing"})
# Download the before and after files from the quivr bucket
before_file_data = supabase_client.storage.from_("quivr").download(
f"{path}{before_file_value}"
)
after_file_data = supabase_client.storage.from_("quivr").download(
f"{path}{after_file_value}"
)
# Generate a random string of 8 characters
random_string = "".join(random.choices(string.ascii_letters + string.digits, k=8))
# Write temp files with the original name without using save_uploaded_file
# because the file is already in the quivr bucket
before_file_path = f"/tmp/{random_string}_{before_file_value}"
after_file_path = f"/tmp/{random_string}_{after_file_value}"
with open(before_file_path, "wb") as f:
f.write(before_file_data)
with open(after_file_path, "wb") as f:
f.write(after_file_data)
assert input_assistant.inputs.select_texts is not None
value_use_case = input_assistant.inputs.select_texts[0].value
## Get the document type
document_type = None
if value_use_case == "Etiquettes":
document_type = DocumentType.ETIQUETTE
elif value_use_case == "Cahier des charges":
document_type = DocumentType.CAHIER_DES_CHARGES
else:
raise ValueError(f"Invalid value for use case: {value_use_case}")
## Get the hard to read document boolean value
assert input_assistant.inputs.booleans is not None
hard_to_read_document = input_assistant.inputs.booleans[0].value
assert before_file_data is not None
assert after_file_data is not None
openai_gpt4o = ChatOpenAI(
model="gpt-4o",
temperature=0,
max_tokens=None,
max_retries=2,
)
use_llm_comparator = document_type == DocumentType.ETIQUETTE
report = await create_modification_report(
before_file=before_file_path,
after_file=after_file_path,
type=document_type,
llm=openai_gpt4o,
partition=hard_to_read_document,
use_llm_comparator=use_llm_comparator,
)
os.unlink(before_file_path)
os.unlink(after_file_path)
return report
async def create_modification_report(
before_file: str | Path | bytes,
after_file: str | Path | bytes,
type: DocumentType,
llm: BaseChatModel,
partition: bool = False,
use_llm_comparator: bool = False,
parser=DeadlyParser(),
) -> str:
if type == DocumentType.ETIQUETTE:
logger.debug("parsing before file")
before_text = parser.deep_parse(before_file, partition=partition, llm=llm)
logger.debug("parsing after file")
after_text = parser.deep_parse(after_file, partition=partition, llm=llm)
elif type == DocumentType.CAHIER_DES_CHARGES:
before_text = await parser.aparse(before_file)
after_text = await parser.aparse(after_file)
logger.debug(before_text.page_content)
logger.debug(after_text.page_content)
text_before_sections = before_text.page_content.split("\n# ")
text_after_sections = after_text.page_content.split("\n# ")
assert len(text_after_sections) == len(text_before_sections)
if use_llm_comparator:
logger.debug("using llm comparator")
llm_comparator_result = llm_comparator(
before_text.page_content, after_text.page_content, llm=llm
)
return llm_comparator_result
logger.debug("using diff match patch")
dmp = diff_match_patch()
section_diffs = []
for before_section, after_section in zip(
text_before_sections, text_after_sections, strict=False
):
main_diff: list[tuple[int, str]] = dmp.diff_main(before_section, after_section)
section_diffs.append(DiffResult(main_diff))
logger.debug(section_diffs)
report = redact_report(section_diffs, llm=llm)
return report
def save_uploaded_file(uploaded_file):
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(uploaded_file.name)[1]
) as tmp_file:
tmp_file.write(uploaded_file.getvalue())
return tmp_file.name
# st.title("Document Modification Report Generator : Use Case 3")
# # File uploaders
# before_file = st.file_uploader("Upload 'Before' file", type=["pdf", "docx"])
# after_file = st.file_uploader("Upload 'After' file", type=["pdf", "docx"])
# # Document type selector
# doc_type = st.selectbox("Select document type", ["ETIQUETTE", "CAHIER_DES_CHARGES"])
# # Complexity of document
# complexity = st.checkbox("Complex document (lot of text of OCRise)")
# # Process button
# if st.button("Process"):
# if before_file and after_file:
# with st.spinner("Processing files..."):
# # Save uploaded files
# before_path = save_uploaded_file(before_file)
# after_path = save_uploaded_file(after_file)
# # Initialize LLM
# openai_gpt4o = ChatOpenAI(
# model="gpt-4o",
# temperature=0,
# max_tokens=None,
# max_retries=2,
# )
# use_llm_comparator = True if doc_type == "ETIQUETTE" else False
# # Generate report
# logger.debug("generating report")
# report = asyncio.run(
# create_modification_report(
# before_path,
# after_path,
# DocumentType[doc_type],
# openai_gpt4o,
# partition=complexity,
# use_llm_comparator=use_llm_comparator,
# )
# )
# logger.debug("report generated")
# # Display results
# st.subheader("Modification Report")
# st.write(report)
# # Clean up temporary files
# os.unlink(before_path)
# os.unlink(after_path)
# else:
# st.error("Please upload both 'Before' and 'After' files.")

View File

@ -2,6 +2,7 @@ import asyncio
import os
from uuid import UUID
import torch
from celery.schedules import crontab
from celery.signals import worker_process_init
from dotenv import load_dotenv
@ -32,8 +33,8 @@ from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
from sqlmodel import Session, text
from sqlmodel.ext.asyncio.session import AsyncSession
from quivr_worker.celery_monitor import is_being_executed
from quivr_worker.assistants.assistants import process_assistant
from quivr_worker.celery_monitor import is_being_executed
from quivr_worker.check_premium import check_is_premium
from quivr_worker.process.process_s3_file import process_uploaded_file
from quivr_worker.process.process_url import process_url_func
@ -46,6 +47,9 @@ from quivr_worker.syncs.process_active_syncs import (
from quivr_worker.syncs.store_notion import fetch_and_store_notion_files_async
from quivr_worker.utils.utils import _patch_json
torch.set_num_threads(1)
load_dotenv()
get_logger("quivr_core")
@ -130,6 +134,8 @@ async def aprocess_assistant_task(
task_id: int,
user_id: str,
):
global async_engine
assert async_engine
async with AsyncSession(async_engine) as async_session:
try:
await async_session.execute(

View File

@ -34,9 +34,9 @@ class PDFGenerator(FPDF):
)
def header(self):
# Logo
logo_path = os.path.join(os.path.dirname(__file__), "logo.png")
self.image(logo_path, 10, 10, 20) # Adjust size as needed
# # Logo
# logo_path = os.path.join(os.path.dirname(__file__), "logo.png")
# self.image(logo_path, 10, 10, 20) # Adjust size as needed
# Move cursor to right of image
self.set_xy(20, 15)
@ -59,15 +59,31 @@ class PDFGenerator(FPDF):
def chapter_body(self):
self.set_font("DejaVu", "", 12)
self.multi_cell(
0,
10,
self.pdf_model.content,
markdown=True,
new_x=XPos.RIGHT,
new_y=YPos.TOP,
)
self.ln()
content_lines = self.pdf_model.content.split("\n")
for line in content_lines:
if line.startswith("# "):
self.ln() # Add line break before header
self.set_font("DejaVu", "B", 16)
self.multi_cell(0, 10, line[2:], markdown=False)
elif line.startswith("## "):
self.ln() # Add line break before header
self.set_font("DejaVu", "B", 14)
self.multi_cell(0, 10, line[3:], markdown=False)
elif line.startswith("### "):
self.ln() # Add line break before header
self.set_font("DejaVu", "B", 12)
self.multi_cell(0, 10, line[4:], markdown=False)
else:
self.set_font("DejaVu", "", 12)
self.multi_cell(
0,
10,
line,
markdown=True,
new_x=XPos.RIGHT,
new_y=YPos.TOP,
)
self.ln()
def print_pdf(self):
self.add_page()
@ -78,7 +94,11 @@ if __name__ == "__main__":
pdf_model = PDFModel(
title="Summary of Legal Services Rendered by Orrick",
content="""
# Main Header
## Sub Header
### Sub Sub Header
**Summary:**
This is a summary of the legal services rendered.
""",
)
pdf = PDFGenerator(pdf_model)

View File

@ -53,7 +53,7 @@ services:
volumes:
- ./backend/:/app/
command: >
/bin/bash -c "python -m celery -A quivr_worker.celery_worker worker -l info -E"
/bin/bash -c "python -m celery -A quivr_worker.celery_worker worker -l info -E -P solo"
restart: always
depends_on:
- redis

View File

@ -1,29 +0,0 @@
@use "styles/Spacings.module.scss";
.modal_content_container {
padding: Spacings.$spacing05;
display: flex;
flex-direction: column;
height: 100%;
justify-content: space-between;
.modal_content_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing05;
.message_wrapper {
display: flex;
flex-direction: column;
}
.title {
font-weight: 600;
}
}
.button {
display: flex;
align-self: flex-end;
}
}

View File

@ -1,151 +0,0 @@
import { useState } from "react";
import { Assistant } from "@/lib/api/assistants/types";
import { useAssistants } from "@/lib/api/assistants/useAssistants";
import { Stepper } from "@/lib/components/AddBrainModal/components/Stepper/Stepper";
import { StepValue } from "@/lib/components/AddBrainModal/types/types";
import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox";
import { Modal } from "@/lib/components/ui/Modal/Modal";
import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton";
import { Step } from "@/lib/types/Modal";
import styles from "./AssistantModal.module.scss";
import { InputsStep } from "./InputsStep/InputsStep";
import { OutputsStep } from "./OutputsStep/OutputsStep";
interface AssistantModalProps {
isOpen: boolean;
setIsOpen: (value: boolean) => void;
assistant: Assistant;
}
export const AssistantModal = ({
isOpen,
setIsOpen,
assistant,
}: AssistantModalProps): JSX.Element => {
const steps: Step[] = [
{
label: "Inputs",
value: "FIRST_STEP",
},
{
label: "Outputs",
value: "SECOND_STEP",
},
];
const [currentStep, setCurrentStep] = useState<StepValue>("FIRST_STEP");
const [emailOutput, setEmailOutput] = useState<boolean>(true);
const [brainOutput, setBrainOutput] = useState<string>("");
const [files, setFiles] = useState<{ key: string; file: File | null }[]>(
assistant.inputs.files.map((fileInput) => ({
key: fileInput.key,
file: null,
}))
);
const { processAssistant } = useAssistants();
const handleFileChange = (file: File, inputKey: string) => {
setFiles((prevFiles) =>
prevFiles.map((fileObj) =>
fileObj.key === inputKey ? { ...fileObj, file } : fileObj
)
);
};
const handleSetIsOpen = (value: boolean) => {
if (!value) {
setCurrentStep("FIRST_STEP");
}
setIsOpen(value);
};
const handleProcessAssistant = async () => {
handleSetIsOpen(false);
await processAssistant(
{
name: assistant.name,
inputs: {
files: files.map((file) => ({
key: file.key,
value: (file.file as File).name,
})),
urls: [],
texts: [],
},
outputs: {
email: {
activated: emailOutput,
},
brain: {
activated: brainOutput !== "",
value: brainOutput,
},
},
},
files.map((file) => file.file as File)
);
};
return (
<Modal
title={assistant.name}
desc={assistant.description}
isOpen={isOpen}
setOpen={handleSetIsOpen}
size="big"
CloseTrigger={<div />}
>
<div className={styles.modal_content_container}>
<div className={styles.modal_content_wrapper}>
<Stepper steps={steps} currentStep={currentStep} />
{currentStep === "FIRST_STEP" ? (
<MessageInfoBox type="tutorial">
<div className={styles.message_wrapper}>
<span className={styles.title}>Expected Input</span>
{assistant.input_description}
</div>
</MessageInfoBox>
) : (
<MessageInfoBox type="tutorial">
<div className={styles.message_wrapper}>
<span className={styles.title}>Output</span>
{assistant.output_description}
</div>
</MessageInfoBox>
)}
{currentStep === "FIRST_STEP" ? (
<InputsStep
inputs={assistant.inputs}
onFileChange={handleFileChange}
/>
) : (
<OutputsStep
setEmailOutput={setEmailOutput}
setBrainOutput={setBrainOutput}
/>
)}
</div>
<div className={styles.button}>
{currentStep === "FIRST_STEP" ? (
<QuivrButton
label="Next"
color="primary"
iconName="chevronRight"
onClick={() => setCurrentStep("SECOND_STEP")}
disabled={!!files.find((file) => !file.file)}
/>
) : (
<QuivrButton
label="Process"
color="primary"
iconName="chevronRight"
onClick={() => handleProcessAssistant()}
disabled={!emailOutput && brainOutput === ""}
/>
)}
</div>
</div>
</Modal>
);
};

View File

@ -1,28 +0,0 @@
import { capitalCase } from "change-case";
import { AssistantInputs } from "@/lib/api/assistants/types";
import { FileInput } from "@/lib/components/ui/FileInput/FileInput";
interface InputsStepProps {
inputs: AssistantInputs;
onFileChange: (file: File, inputKey: string) => void;
}
export const InputsStep = ({
inputs,
onFileChange,
}: InputsStepProps): JSX.Element => {
return (
<div>
{inputs.files.map((fileInput) => (
<FileInput
key={fileInput.key}
label={capitalCase(fileInput.key)}
icon="file"
acceptedFileTypes={fileInput.allowed_extensions}
onFileChange={(file) => onFileChange(file, fileInput.key)}
/>
))}
</div>
);
};

View File

@ -1,16 +0,0 @@
@use "styles/Spacings.module.scss";
.outputs_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing03;
.message_wrapper {
width: 100%;
}
.brain_selector {
padding-block: Spacings.$spacing02;
max-width: 250px;
}
}

View File

@ -1,83 +0,0 @@
import { useMemo, useState } from "react";
import { formatMinimalBrainsToSelectComponentInput } from "@/app/chat/[chatId]/components/ActionsBar/components/KnowledgeToFeed/utils/formatMinimalBrainsToSelectComponentInput";
import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox";
import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox";
import { SingleSelector } from "@/lib/components/ui/SingleSelector/SingleSelector";
import { requiredRolesForUpload } from "@/lib/config/upload";
import { useBrainContext } from "@/lib/context/BrainProvider/hooks/useBrainContext";
import styles from "./OutputsStep.module.scss";
interface OutputsStepProps {
setEmailOutput: (value: boolean) => void;
setBrainOutput: (value: string) => void;
}
export const OutputsStep = ({
setEmailOutput,
setBrainOutput,
}: OutputsStepProps): JSX.Element => {
const [existingBrainChecked, setExistingBrainChecked] =
useState<boolean>(false);
const [selectedBrainId, setSelectedBrainId] = useState<string>("");
const { allBrains } = useBrainContext();
const brainsWithUploadRights = formatMinimalBrainsToSelectComponentInput(
useMemo(
() =>
allBrains.filter(
(brain) =>
requiredRolesForUpload.includes(brain.role) && !!brain.max_files
),
[allBrains]
)
);
return (
<div className={styles.outputs_wrapper}>
<MessageInfoBox type="info">
It can take a few minutes to process.
</MessageInfoBox>
<Checkbox
label="Receive the results by Email"
checked={true}
setChecked={setEmailOutput}
/>
<Checkbox
label="Upload the results on an existing Brain"
checked={existingBrainChecked}
setChecked={() => {
if (existingBrainChecked) {
setBrainOutput("");
setSelectedBrainId("");
}
setExistingBrainChecked(!existingBrainChecked);
}}
/>
{existingBrainChecked && (
<div className={styles.brain_selector}>
<SingleSelector
options={brainsWithUploadRights}
onChange={(brain) => {
setBrainOutput(brain);
setSelectedBrainId(brain);
}}
selectedOption={
selectedBrainId
? {
value: selectedBrainId,
label: allBrains.find(
(brain) => brain.id === selectedBrainId
)?.name as string,
}
: undefined
}
placeholder="Select a brain"
iconName="brain"
/>
</div>
)}
</div>
);
};

View File

@ -1,20 +0,0 @@
@use "styles/Spacings.module.scss";
.content_wrapper {
padding: Spacings.$spacing06;
display: flex;
flex-direction: column;
gap: Spacings.$spacing05;
.assistants_grid {
display: flex;
gap: Spacings.$spacing03;
flex-wrap: wrap;
}
.message_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing02;
}
}

View File

@ -1,109 +0,0 @@
"use client";
import { redirect, usePathname } from "next/navigation";
import { useEffect, useState } from "react";
import { Assistant } from "@/lib/api/assistants/types";
import { useAssistants } from "@/lib/api/assistants/useAssistants";
import { PageHeader } from "@/lib/components/PageHeader/PageHeader";
import { BrainCard } from "@/lib/components/ui/BrainCard/BrainCard";
import { MessageInfoBox } from "@/lib/components/ui/MessageInfoBox/MessageInfoBox";
import { useSupabase } from "@/lib/context/SupabaseProvider";
import { redirectToLogin } from "@/lib/router/redirectToLogin";
import { AssistantModal } from "./AssistantModal/AssistantModal";
import styles from "./page.module.scss";
const Assistants = (): JSX.Element => {
const pathname = usePathname();
const { session } = useSupabase();
const [assistants, setAssistants] = useState<Assistant[]>([]);
const [assistantModalOpened, setAssistantModalOpened] =
useState<boolean>(false);
const [currentAssistant, setCurrentAssistant] = useState<Assistant | null>(
null
);
const { getAssistants } = useAssistants();
useEffect(() => {
// REMOVE FOR NOW ACCESS TO QUIVR ASSISTANTS
redirect("/search");
if (session === null) {
redirectToLogin();
}
void (async () => {
try {
const res = await getAssistants();
if (res) {
setAssistants(res);
}
} catch (error) {
console.error(error);
}
})();
}, [pathname, session]);
return (
<>
<div className={styles.page_header}>
<PageHeader
iconName="assistant"
label="Quivr Assistants"
buttons={[]}
/>
<div className={styles.content_wrapper}>
<MessageInfoBox type="info">
<div className={styles.message_wrapper}>
<span>
A Quivr Assistant is an AI agent that applies specific processes
to an input in order to generate a usable output.
</span>
<span>
For now, you can try the summary assistant, which summarizes a
document and sends the result by email or uploads it to one of
your brains.
</span>
<span> But don&apos;t worry! Other assistants are cooking!</span>
</div>
</MessageInfoBox>
<MessageInfoBox type="warning">
<div className={styles.message_wrapper}>
<span>
<strong>Feature still in Beta.</strong> Please provide feedback
in the chat below!
</span>
</div>
</MessageInfoBox>
<div className={styles.assistants_grid}>
{assistants.map((assistant) => {
return (
<BrainCard
tooltip={assistant.description}
brainName={assistant.name}
tags={assistant.tags}
imageUrl={assistant.icon_url}
callback={() => {
setAssistantModalOpened(true);
setCurrentAssistant(assistant);
}}
key={assistant.name}
cardKey={assistant.name}
/>
);
})}
</div>
</div>
</div>
{currentAssistant && (
<AssistantModal
isOpen={assistantModalOpened}
setIsOpen={setAssistantModalOpened}
assistant={currentAssistant}
/>
)}
</>
);
};
export default Assistants;

View File

@ -3,9 +3,7 @@
@import "tailwindcss/utilities";
@import './colors.css';
* {
@apply scrollbar;
}
main {
@apply max-w-screen-xl mx-auto flex flex-col;
@ -64,6 +62,7 @@ div:focus {
--background-blur: rgba(0, 0, 0, 0.9);
--background-success: var(--success-lightest);
--background-error: var(--dangerous-lightest);
--background-pending: var(--background-3);
/* Borders */
--border-0: var(--grey-5);
@ -101,6 +100,7 @@ body.dark_mode {
--background-blur: rgba(0, 0, 0, 0.9);
--background-success: var(--black-5);
--background-error: var(--black-5);
--background-pending: var(--black-5);
/* Borders */
--border-0: var(--black-5);

View File

@ -0,0 +1,41 @@
@use "styles/Radius.module.scss";
@use "styles/Spacings.module.scss";
@use "styles/Typography.module.scss";
.assistant_tab_wrapper {
display: flex;
flex-direction: column;
align-items: center;
gap: Spacings.$spacing05;
border-radius: Radius.$normal;
border: 1px solid var(--border-0);
padding: Spacings.$spacing05;
width: 250px;
cursor: pointer;
height: 100%;
&.disabled {
pointer-events: none;
opacity: 0.3;
}
.header {
display: flex;
align-self: flex-start;
align-items: center;
gap: Spacings.$spacing03;
.title {
@include Typography.H3;
}
}
.description {
font-size: Typography.$small;
font-style: italic;
}
&:hover {
background-color: var(--background-3);
}
}

View File

@ -0,0 +1,29 @@
"use client";
import { Icon } from "@/lib/components/ui/Icon/Icon";
import styles from "./AssistantCard.module.scss";
import { Assistant } from "../../types/assistant";
interface AssistantCardProps {
assistant: Assistant;
}
const AssistantCard = ({ assistant }: AssistantCardProps): JSX.Element => {
return (
<div
className={`${styles.assistant_tab_wrapper} ${
assistant.tags.includes("Disabled") ? styles.disabled : ""
}`}
>
<div className={styles.header}>
<Icon name="assistant" color="black" size="normal" />
<span className={styles.title}>{assistant.name}</span>
</div>
<span className={styles.description}>{assistant.description}</span>
</div>
);
};
export default AssistantCard;

View File

@ -0,0 +1,62 @@
@use "styles/Spacings.module.scss";
@use "styles/Typography.module.scss";
.assistant_tab_wrapper {
height: 100%;
display: flex;
flex-direction: column;
justify-content: space-between;
.content_section {
display: flex;
flex-direction: column;
gap: Spacings.$spacing06;
.title {
@include Typography.H2;
}
.assistant_choice_wrapper {
display: flex;
gap: Spacings.$spacing05;
align-items: stretch;
flex-wrap: wrap;
}
}
.form_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing06;
.title {
@include Typography.H2;
}
.file_inputs_wrapper {
display: flex;
justify-content: space-between;
width: 100%;
gap: Spacings.$spacing05;
.file_input_wrapper {
width: 100%;
display: flex;
flex-direction: column;
gap: Spacings.$spacing03;
.file_header {
display: flex;
align-items: center;
gap: Spacings.$spacing03;
font-size: Typography.$small;
}
}
}
}
.buttons_wrapper {
display: flex;
justify-content: space-between;
}
}

View File

@ -0,0 +1,267 @@
"use client";
import { useEffect, useState } from "react";
import { useAssistants } from "@/lib/api/assistants/useAssistants";
import { FileInput } from "@/lib/components/ui/FileInput/FileInput";
import { Icon } from "@/lib/components/ui/Icon/Icon";
import QuivrButton from "@/lib/components/ui/QuivrButton/QuivrButton";
import AssistantCard from "./AssistantCard/AssistantCard";
import styles from "./AssistantTab.module.scss";
import BooleansInputs from "./BooleansInputs/BooleansInputs";
import SelectorsInputs from "./SelectorsInput/SelectorsInputs";
import { Assistant, ProcessAssistantData } from "../types/assistant";
export interface ProcessAssistantInput {
input: ProcessAssistantData;
files: File[];
}
interface AssistantTabProps {
setSelectedTab: (tab: string) => void;
}
const FILE_TYPES = ["pdf", "docx", "doc", "txt"];
const useAssistantData = () => {
const [assistants, setAssistants] = useState<Assistant[]>([]);
const [assistantChoosed, setAssistantChoosed] = useState<
Assistant | undefined
>(undefined);
const { getAssistants } = useAssistants();
useEffect(() => {
void (async () => {
try {
const res = await getAssistants();
setAssistants(res);
} catch (error) {
console.error(error);
}
})();
}, []);
return { assistants, assistantChoosed, setAssistantChoosed };
};
const useFormStates = (assistantChoosed: Assistant | undefined) => {
const [booleanStates, setBooleanStates] = useState<{
[key: string]: boolean | null;
}>({});
const [selectTextStates, setSelectTextStates] = useState<{
[key: string]: string | null;
}>({});
const [fileStates, setFileStates] = useState<{ [key: string]: File }>({});
const [isFormValid, setIsFormValid] = useState<boolean>(false);
useEffect(() => {
if (assistantChoosed?.inputs.booleans) {
const initialBooleanStates = assistantChoosed.inputs.booleans.reduce(
(acc, input) => ({ ...acc, [input.key]: false }),
{}
);
setBooleanStates(initialBooleanStates);
}
if (assistantChoosed?.inputs.select_texts) {
const initialSelectTextStates =
assistantChoosed.inputs.select_texts.reduce(
(acc, input) => ({ ...acc, [input.key]: input.options[0] }),
{}
);
setSelectTextStates(initialSelectTextStates);
}
}, [assistantChoosed]);
return {
booleanStates,
setBooleanStates,
selectTextStates,
setSelectTextStates,
fileStates,
setFileStates,
isFormValid,
setIsFormValid,
};
};
const validateForm = (
assistantChoosed: Assistant | undefined,
booleanStates: { [x: string]: boolean | null },
fileStates: { [x: string]: File | undefined },
selectTextStates: { [x: string]: string | null }
) => {
if (!assistantChoosed) {
return false;
}
const allBooleansSet =
assistantChoosed.inputs.booleans?.every(
(input) =>
booleanStates[input.key] !== undefined &&
booleanStates[input.key] !== null
) ?? true;
const allFilesSet = assistantChoosed.inputs.files.every(
(input) => fileStates[input.key] !== undefined
);
const allSelectTextsSet =
assistantChoosed.inputs.select_texts?.every(
(input) =>
selectTextStates[input.key] !== undefined &&
selectTextStates[input.key] !== null
) ?? true;
return allBooleansSet && allFilesSet && allSelectTextsSet;
};
const AssistantTab = ({ setSelectedTab }: AssistantTabProps): JSX.Element => {
const { assistants, assistantChoosed, setAssistantChoosed } =
useAssistantData();
const {
booleanStates,
setBooleanStates,
selectTextStates,
setSelectTextStates,
fileStates,
setFileStates,
isFormValid,
setIsFormValid,
} = useFormStates(assistantChoosed);
const { processTask } = useAssistants();
const [loading, setLoading] = useState<boolean>(false);
const handleFileChange = (key: string, file: File) => {
setFileStates((prevState) => ({
...prevState,
[key]: file,
}));
};
useEffect(() => {
setIsFormValid(
validateForm(
assistantChoosed,
booleanStates,
fileStates,
selectTextStates
)
);
}, [booleanStates, fileStates, selectTextStates, assistantChoosed]);
const handleSubmit = async () => {
if (assistantChoosed) {
const processAssistantData: ProcessAssistantData = {
id: assistantChoosed.id,
name: assistantChoosed.name,
inputs: {
files: Object.keys(fileStates).map((key) => ({
key,
value: fileStates[key].name,
})),
booleans: Object.keys(booleanStates).map((key) => ({
key,
value: booleanStates[key] ?? null,
})),
select_texts: Object.keys(selectTextStates).map((key) => ({
key,
value: selectTextStates[key],
})),
},
};
const processAssistantInput: ProcessAssistantInput = {
input: processAssistantData,
files: Object.values(fileStates),
};
setLoading(true);
await processTask(processAssistantInput);
setSelectedTab("Process");
setLoading(false);
}
};
const resetForm = () => {
setBooleanStates({});
setSelectTextStates({});
setFileStates({});
setIsFormValid(false);
};
const handleBack = () => {
resetForm();
setAssistantChoosed(undefined);
};
return (
<div className={styles.assistant_tab_wrapper}>
{!assistantChoosed ? (
<div className={styles.content_section}>
<span className={styles.title}>Choose an assistant</span>
<div className={styles.assistant_choice_wrapper}>
{assistants.map((assistant, index) => (
<div key={index} onClick={() => setAssistantChoosed(assistant)}>
<AssistantCard assistant={assistant} />
</div>
))}
</div>
</div>
) : (
<div className={styles.form_wrapper}>
<span className={styles.title}>{assistantChoosed.name}</span>
<div className={styles.file_inputs_wrapper}>
{assistantChoosed.inputs.files.map((input, index) => (
<div className={styles.file_input_wrapper} key={index}>
<div className={styles.file_header}>
<Icon name="file" color="black" size="small" />
<span>{input.key}</span>
</div>
<FileInput
label={input.key}
onFileChange={(file) => handleFileChange(input.key, file)}
acceptedFileTypes={FILE_TYPES}
/>
</div>
))}
</div>
<SelectorsInputs
selectTexts={assistantChoosed.inputs.select_texts ?? []}
selectTextStates={selectTextStates}
setSelectTextStates={setSelectTextStates}
/>
<BooleansInputs
booleans={assistantChoosed.inputs.booleans ?? []}
conditionalInputs={assistantChoosed.inputs.conditional_inputs}
booleanStates={booleanStates}
setBooleanStates={setBooleanStates}
selectTextStates={selectTextStates}
/>
</div>
)}
{assistantChoosed && (
<div className={styles.buttons_wrapper}>
<QuivrButton
iconName="chevronLeft"
label="Back"
color="primary"
onClick={() => handleBack()}
/>
<QuivrButton
iconName="chevronRight"
label="EXECUTE"
color="primary"
important={true}
onClick={handleSubmit}
isLoading={loading}
disabled={!isFormValid}
/>
</div>
)}
</div>
);
};
export default AssistantTab;

View File

@ -0,0 +1,5 @@
@use "styles/Variables.module.scss";
.boolean_inputs_wrapper {
width: Variables.$assistantInputWidth;
}

View File

@ -0,0 +1,74 @@
"use client";
import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox";
import styles from "./BooleansInputs.module.scss";
import { ConditionalInput } from "../../types/assistant";
interface BooleansInputsProps {
booleans: { key: string; description: string }[];
conditionalInputs?: ConditionalInput[];
booleanStates: { [key: string]: boolean | null };
setBooleanStates: React.Dispatch<
React.SetStateAction<{ [key: string]: boolean | null }>
>;
selectTextStates: { [key: string]: string | null };
}
const BooleansInputs = ({
booleans,
conditionalInputs,
booleanStates,
setBooleanStates,
selectTextStates,
}: BooleansInputsProps): JSX.Element => {
const handleCheckboxChange = (key: string, checked: boolean) => {
setBooleanStates((prevState: { [key: string]: boolean | null }) => ({
...prevState,
[key]: checked,
}));
};
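  // Resolve the current value of the input a condition refers to (boolean
  // state first, then select-text state) and compare it per the declared condition.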
const checkCondition = (conditionalInput: ConditionalInput): boolean => {
const { key, condition, value } = conditionalInput;
const targetValue =
booleanStates[key]?.toString() ?? selectTextStates[key] ?? "";
if (condition === "equals") {
return targetValue === value;
} else {
return targetValue !== value;
}
};
return (
<div className={styles.boolean_inputs_wrapper}>
{booleans.map((input, index) => {
        // Default to visible when no conditional input constrains this key.
        const shouldShow =
          conditionalInputs?.every((conditionalInput) => {
            if (conditionalInput.conditional_key === input.key) {
              return checkCondition(conditionalInput);
            }
            return true;
          }) ?? true;
if (!shouldShow) {
return null;
}
return (
<div key={index} className={styles.boolean_input}>
<Checkbox
label={input.key}
checked={!!booleanStates[input.key]}
setChecked={(checked) => handleCheckboxChange(input.key, checked)}
/>
</div>
);
})}
</div>
);
};
export default BooleansInputs;

View File

@ -0,0 +1,5 @@
@use "styles/Variables.module.scss";
.select_texts_wrapper {
width: Variables.$assistantInputWidth;
}

View File

@ -0,0 +1,49 @@
import React from "react";
import { SingleSelector } from "@/lib/components/ui/SingleSelector/SingleSelector";
import styles from "./SelectorsInputs.module.scss";
interface SelectorsInputsProps {
selectTexts: { key: string; options: string[] }[];
selectTextStates: { [key: string]: string | null };
setSelectTextStates: React.Dispatch<
React.SetStateAction<{ [key: string]: string | null }>
>;
}
const SelectorsInputs = ({
selectTexts,
selectTextStates,
setSelectTextStates,
}: SelectorsInputsProps): JSX.Element => {
const handleSelectTextChange = (key: string, value: string) => {
setSelectTextStates((prevState) => ({
...prevState,
[key]: value,
}));
};
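  // The selector displays the first option as a fallback while the state for a
  // key is still null; state is only set once the user actually picks an option.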
return (
<div className={styles.select_texts_wrapper}>
{selectTexts.map((input, index) => (
<div key={index} className={styles.select_text}>
<SingleSelector
iconName="brain"
placeholder={input.key}
options={input.options.map((option) => {
return { label: option, value: option };
})}
onChange={(value) => handleSelectTextChange(input.key, value)}
selectedOption={{
label: selectTextStates[input.key] ?? input.options[0],
value: selectTextStates[input.key] ?? input.options[0],
}}
/>
</div>
))}
</div>
);
};
export default SelectorsInputs;

View File

@ -0,0 +1,193 @@
@use "styles/Radius.module.scss";
@use "styles/Spacings.module.scss";
@use "styles/Typography.module.scss";
@use "styles/Variables.module.scss";
.process_wrapper {
padding-inline: Spacings.$spacing06;
display: flex;
gap: Spacings.$spacing02;
justify-content: space-between;
align-items: center;
border: 1px solid var(--border-0);
padding-block: Spacings.$spacing03;
position: relative;
overflow: visible;
font-size: Typography.$small;
border-bottom: none;
&.last {
border-radius: 0 0 Radius.$normal Radius.$normal;
border-bottom: 1px solid var(--border-0);
}
&.clickable {
cursor: pointer;
&:hover {
background-color: var(--background-1);
}
}
.left {
display: flex;
align-items: center;
gap: calc(Spacings.$spacing06 + 6px);
overflow: hidden;
.left_fields {
display: flex;
align-items: center;
overflow: hidden;
.assistant {
font-size: Typography.$small;
min-width: Variables.$menuSectionWidth;
max-width: Variables.$menuSectionWidth;
}
.files {
font-size: Typography.$tiny;
color: var(--text-4);
overflow: hidden;
.filename {
@include Typography.EllipsisOverflow;
}
}
}
}
.right {
display: flex;
gap: Spacings.$spacing05;
align-items: center;
.date {
font-size: Typography.$very_tiny;
width: 150px;
display: flex;
align-items: center;
justify-content: center;
@include Typography.EllipsisOverflow;
}
.status {
width: 100px;
display: flex;
align-items: center;
justify-content: center;
@include Typography.EllipsisOverflow;
}
}
}
.markdown {
p {
margin: 0;
padding-block: Spacings.$spacing06;
align-items: center;
}
ul {
list-style-type: disc;
margin-top: 0;
padding: 0;
margin-left: Spacings.$spacing05;
display: flex;
flex-direction: column;
gap: Spacings.$spacing03;
li {
white-space-collapse: collapse;
}
}
ol {
list-style-type: decimal;
padding-left: Spacings.$spacing05;
list-style-position: outside;
li {
white-space-collapse: collapse;
}
}
h1 {
@include Typography.H1;
}
h2 {
@include Typography.H2;
}
h3 {
@include Typography.H3;
}
table {
width: 100%;
border-collapse: collapse;
margin: Spacings.$spacing05 0;
}
thead {
background-color: var(--background-1);
}
tr {
border-bottom: 1px solid var(--border-0);
}
th,
td {
padding: Spacings.$spacing03;
text-align: left;
}
th {
font-weight: bold;
}
pre[class*="language-"] {
background: var(--background-5);
color: var(--white-0);
padding: Spacings.$spacing05;
border-radius: Radius.$normal;
overflow: auto;
margin: 0 0 Spacings.$spacing05 0;
white-space: pre-wrap;
font-size: Typography.$small;
font-family: "Courier New", Courier, monospace;
}
code[class*="language-"] {
background: none;
color: inherit;
border-radius: Radius.$normal;
font-family: "Courier New", Courier, monospace;
font-size: Typography.$small;
white-space: pre-wrap;
}
code {
background: var(--background-5);
color: var(--white-0);
padding: Spacings.$spacing01;
border-radius: Radius.$normal;
font-family: "Courier New", Courier, monospace;
font-size: Typography.$medium;
}
.code_block {
.icon {
position: absolute;
right: 0;
padding: Spacings.$spacing05;
}
code {
white-space: pre-wrap;
}
}
}

View File

@ -0,0 +1,173 @@
"use client";
import { capitalCase } from "change-case";
import format from "date-fns/format";
import { fr } from "date-fns/locale";
import { saveAs } from "file-saver";
import { useState } from "react";
import ReactMarkdown from "react-markdown";
import gfm from "remark-gfm";
import { useAssistants } from "@/lib/api/assistants/useAssistants";
import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox";
import { Icon } from "@/lib/components/ui/Icon/Icon";
import { LoaderIcon } from "@/lib/components/ui/LoaderIcon/LoaderIcon";
import { Modal } from "@/lib/components/ui/Modal/Modal";
import { Tag } from "@/lib/components/ui/Tag/Tag";
import { useDevice } from "@/lib/hooks/useDevice";
import styles from "./ProcessLine.module.scss";
import { Process } from "../../types/process";
interface ProcessLineProps {
process: Process;
last?: boolean;
selected: boolean;
setSelected: (selected: boolean, event: React.MouseEvent) => void;
}
const ProcessLine = ({
process,
last,
selected,
setSelected,
}: ProcessLineProps): JSX.Element => {
const [showResult, setShowResult] = useState(false);
const [downloadUrl, setDownloadUrl] = useState<string | null>(null);
const { isMobile } = useDevice();
const { downloadTaskResult } = useAssistants();
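  // Lazily fetch the download URL the first time a completed task is hovered.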
const handleMouseEnter = async () => {
if (process.status === "completed" && !downloadUrl) {
const res: string = await downloadTaskResult(process.id);
setDownloadUrl(res);
}
};
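  // The returned URL may point at host.docker.internal (Docker setups); it is
  // rewritten to localhost so the browser can reach it before saving the file.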
const handleDownload = async () => {
if (downloadUrl) {
const response = await fetch(
downloadUrl.replace("host.docker.internal", "localhost")
);
const blob = await response.blob();
const formattedDate = format(
new Date(process.creation_time),
"yyyy-MM-dd",
{ locale: fr }
);
const fileName = `${process.assistant_name}_${formattedDate}.pdf`;
saveAs(blob, fileName);
}
};
return (
<>
<div
className={`${styles.process_wrapper} ${last ? styles.last : ""} ${
process.status === "completed" ? styles.clickable : ""
}`}
onClick={() => {
if (process.status === "completed") {
setShowResult(!showResult);
}
}}
onMouseEnter={() => void handleMouseEnter()}
>
<div className={styles.left}>
<Checkbox
checked={selected}
setChecked={(checked, event) => setSelected(checked, event)}
/>
<div className={styles.left_fields}>
<span className={styles.assistant}>{process.assistant_name}</span>
<span className={styles.files}>
{process.task_metadata.input_files.map((file, index) => (
<div className={styles.filename} key={index}>
<span>{file}</span>
</div>
))}
</span>
</div>
</div>
<div className={styles.right}>
{!isMobile && (
<>
<span className={styles.date}>
{format(
new Date(process.creation_time),
"d MMMM yyyy '-' HH:mm:ss",
{
locale: fr,
}
)}
</span>
<div className={styles.status}>
<Tag
name={capitalCase(process.status)}
color={
process.status === "error"
? "dangerous"
: process.status === "processing"
? "primary"
: process.status === "completed"
? "success"
: "grey"
}
/>
</div>
</>
)}
<div
onClick={(event: React.MouseEvent<HTMLDivElement>) => {
event.stopPropagation();
}}
>
{process.status === "processing" ? (
<LoaderIcon size="normal" color="primary" />
) : downloadUrl ? (
<div onClick={() => void handleDownload()}>
<Icon
name="download"
size="normal"
color="black"
handleHover={process.status === "completed"}
/>
</div>
) : (
<Icon
name={
process.status === "completed"
? "download"
: process.status === "error"
? "warning"
: "waiting"
}
size="normal"
color="black"
handleHover={process.status === "completed"}
/>
)}
</div>
</div>
</div>
<Modal
size="big"
isOpen={showResult}
setOpen={setShowResult}
CloseTrigger={<div />}
>
{process.answer && (
<div className={styles.markdown}>
<ReactMarkdown remarkPlugins={[gfm]}>
{process.answer.replace(/\\n/g, "\n")}
</ReactMarkdown>
</div>
)}
</Modal>
</>
);
};
export default ProcessLine;

View File

@ -0,0 +1,122 @@
@use "styles/Radius.module.scss";
@use "styles/ScreenSizes.module.scss";
@use "styles/Spacings.module.scss";
@use "styles/Typography.module.scss";
@use "styles/Variables.module.scss";
.process_tab_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing05;
padding-bottom: Spacings.$spacing10;
border-radius: Radius.$normal;
@media screen and (max-width: ScreenSizes.$small) {
overflow-x: auto;
}
.title {
@include Typography.H2;
}
.table_header {
display: flex;
justify-content: space-between;
align-items: center;
gap: Spacings.$spacing03;
.search {
width: 250px;
}
}
.first_line {
display: flex;
justify-content: space-between;
padding-left: calc(Spacings.$spacing06);
padding-right: calc(Spacings.$spacing11 + 6px);
padding-block: Spacings.$spacing02;
font-weight: 500;
background-color: var(--background-1);
font-size: Typography.$small;
border: 1px solid var(--border-0);
border-radius: Radius.$normal Radius.$normal 0 0;
border-bottom: none;
&.empty {
border: 1px solid var(--border-0);
border-radius: Radius.$normal;
}
.left {
display: flex;
align-items: center;
gap: calc(Spacings.$spacing06 + 6px);
.left_fields {
display: flex;
align-items: center;
.field {
display: flex;
align-items: center;
gap: Spacings.$spacing02;
cursor: pointer;
.icon {
visibility: hidden;
}
&:hover {
.icon {
visibility: visible;
}
}
&.assistant {
width: Variables.$menuSectionWidth;
}
}
}
}
.right {
display: flex;
gap: calc(Spacings.$spacing12 + Spacings.$spacing06 + 2px);
.status {
display: flex;
align-items: center;
gap: Spacings.$spacing02;
cursor: pointer;
.icon {
visibility: hidden;
}
&:hover {
.icon {
visibility: visible;
}
}
}
.date {
display: flex;
align-items: center;
gap: Spacings.$spacing02;
cursor: pointer;
.icon {
visibility: hidden;
}
&:hover {
.icon {
visibility: visible;
}
}
}
}
}
}

View File

@ -0,0 +1,239 @@
"use client";
import { useEffect, useState } from "react";
import { useAssistants } from "@/lib/api/assistants/useAssistants";
import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox";
import { Icon } from "@/lib/components/ui/Icon/Icon";
import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton";
import { TextInput } from "@/lib/components/ui/TextInput/TextInput";
import { useSupabase } from "@/lib/context/SupabaseProvider";
import { filterAndSort, updateSelectedItems } from "@/lib/helpers/table";
import { useDevice } from "@/lib/hooks/useDevice";
import ProcessLine from "./Process/ProcessLine";
import styles from "./ProcessTab.module.scss";
import { Process } from "../types/process";
const ProcessTab = (): JSX.Element => {
const [processes, setProcesses] = useState<Process[]>([]);
const [searchQuery, setSearchQuery] = useState<string>("");
const [selectedProcess, setSelectedProcess] = useState<Process[]>([]);
const [allChecked, setAllChecked] = useState<boolean>(false);
const [sortConfig, setSortConfig] = useState<{
key: keyof Process;
direction: "ascending" | "descending";
}>({ key: "creation_time", direction: "descending" });
const [filteredProcess, setFilteredProcess] = useState<Process[]>([]);
const [lastSelectedIndex, setLastSelectedIndex] = useState<number | null>(
null
);
const [loading, setLoading] = useState<boolean>(false);
const { getTasks, deleteTask } = useAssistants();
const { supabase } = useSupabase();
const { isMobile } = useDevice();
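  // Fetch the task list and seed both the raw and the filtered views.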
const loadTasks = async () => {
try {
const res = await getTasks();
setProcesses(res);
setFilteredProcess(res);
} catch (error) {
console.error(error);
}
};
const handleStatusChange = () => {
void loadTasks();
};
useEffect(() => {
void loadTasks();
}, []);
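  // Subscribe to Postgres UPDATE events on the public.tasks table through
  // Supabase realtime so the list refreshes whenever a task's status changes.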
useEffect(() => {
const channel = supabase
.channel("tasks")
.on(
"postgres_changes",
{ event: "UPDATE", schema: "public", table: "tasks" },
handleStatusChange
)
.subscribe();
return () => {
void supabase.removeChannel(channel);
};
}, []);
useEffect(() => {
setFilteredProcess(
filterAndSort(
processes,
searchQuery,
sortConfig,
(process) => process[sortConfig.key]
)
);
}, [processes, searchQuery, sortConfig]);
const handleDelete = async () => {
setLoading(true);
await Promise.all(
selectedProcess.map((process) => deleteTask(process.id))
);
const remainingProcesses = processes.filter(
(process) =>
!selectedProcess.some((selected) => selected.id === process.id)
);
setProcesses(remainingProcesses);
setFilteredProcess(
filterAndSort(
remainingProcesses,
searchQuery,
sortConfig,
(process) => process[sortConfig.key]
)
);
setSelectedProcess([]);
setAllChecked(false);
setLoading(false);
};
const handleSelect = (
process: Process,
index: number,
event: React.MouseEvent
) => {
const newSelectedProcess = updateSelectedItems<Process>({
item: process,
index,
event,
lastSelectedIndex,
filteredList: filteredProcess,
selectedItems: selectedProcess,
});
setSelectedProcess(newSelectedProcess.selectedItems);
setLastSelectedIndex(newSelectedProcess.lastSelectedIndex);
};
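  // Clicking the active sort column flips its direction; a new column starts ascending.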
const handleSort = (key: keyof Process) => {
setSortConfig((prevSortConfig) => {
let direction: "ascending" | "descending" = "ascending";
if (
prevSortConfig.key === key &&
prevSortConfig.direction === "ascending"
) {
direction = "descending";
}
return { key, direction };
});
};
return (
<div className={styles.process_tab_wrapper}>
<span className={styles.title}>My Results</span>
<div className={styles.table_header}>
<div className={styles.search}>
<TextInput
iconName="search"
label="Search"
inputValue={searchQuery}
setInputValue={setSearchQuery}
small={true}
/>
</div>
<QuivrButton
label="Delete"
iconName="delete"
color="dangerous"
disabled={selectedProcess.length === 0}
onClick={handleDelete}
isLoading={loading}
/>
</div>
<div>
<div
className={`${styles.first_line} ${
!filteredProcess.length ? styles.empty : ""
}`}
>
<div className={styles.left}>
<Checkbox
checked={allChecked}
setChecked={(checked) => {
setAllChecked(checked);
setSelectedProcess(checked ? filteredProcess : []);
}}
/>
<div className={styles.left_fields}>
<div
className={`${styles.field} ${styles.assistant}`}
onClick={() => handleSort("assistant_name")}
>
Assistant
<div className={styles.icon}>
<Icon name="sort" size="small" color="black" />
</div>
</div>
<div className={styles.field} onClick={() => handleSort("name")}>
Files
<div className={styles.icon}>
<Icon name="sort" size="small" color="black" />
</div>
</div>
</div>
</div>
<div className={styles.right}>
{!isMobile && (
<>
<div
className={styles.date}
onClick={() => handleSort("creation_time")}
>
Date
<div className={styles.icon}>
<Icon name="sort" size="small" color="black" />
</div>
</div>
<div
className={styles.status}
onClick={() => handleSort("status")}
>
Status
<div className={styles.icon}>
<Icon name="sort" size="small" color="black" />
</div>
</div>
</>
)}
</div>
</div>
<div className={styles.process_list}>
{filteredProcess.map((process, index) => (
<div key={process.id} className={styles.process_line}>
<ProcessLine
process={process}
last={index === filteredProcess.length - 1}
selected={selectedProcess.some(
(item) => item.id === process.id
)}
setSelected={(_selected, event) =>
handleSelect(process, index, event)
}
/>
</div>
))}
</div>
</div>
</div>
);
};
export default ProcessTab;

View File

@ -0,0 +1,20 @@
@use "styles/Spacings.module.scss";
.page_wrapper {
display: flex;
flex-direction: column;
gap: Spacings.$spacing05;
width: 100%;
height: 100vh;
overflow: hidden;
.content_wrapper {
padding-inline: Spacings.$spacing09;
padding-block: Spacings.$spacing05;
overflow-y: auto;
display: flex;
flex-direction: column;
gap: Spacings.$spacing05;
height: 100%;
}
}

View File

@ -0,0 +1,47 @@
"use client";
import { useState } from "react";
import PageHeader from "@/lib/components/PageHeader/PageHeader";
import { Tabs } from "@/lib/components/ui/Tabs/Tabs";
import { Tab } from "@/lib/types/Tab";
import AssistantTab from "./AssistantTab/AssistantTab";
import ProcessTab from "./ProcessTab/ProcessTab";
import styles from "./page.module.scss";
const QualityAssistant = (): JSX.Element => {
const [selectedTab, setSelectedTab] = useState("Assistants");
const qualityAssistantTab: Tab[] = [
{
label: "Assistants",
isSelected: selectedTab === "Assistants",
onClick: () => setSelectedTab("Assistants"),
iconName: "assistant",
},
{
label: "Process",
isSelected: selectedTab === "Process",
onClick: () => setSelectedTab("Process"),
iconName: "waiting",
},
];
return (
<div className={styles.page_wrapper}>
<div className={styles.page_header}>
<PageHeader iconName="assistant" label="Assistants" buttons={[]} />
</div>
<div className={styles.content_wrapper}>
<Tabs tabList={qualityAssistantTab} />
{selectedTab === "Assistants" && (
<AssistantTab setSelectedTab={setSelectedTab} />
)}
{selectedTab === "Process" && <ProcessTab />}
</div>
</div>
);
};
export default QualityAssistant;

View File

@ -0,0 +1,123 @@
interface Pricing {
cost: number;
description: string;
}
interface InputFile {
key: string;
allowed_extensions: string[];
required: boolean;
description: string;
}
interface InputUrl {
key: string;
required: boolean;
description: string;
}
interface InputText {
key: string;
required: boolean;
description: string;
validation_regex: string;
}
interface InputBoolean {
key: string;
required: boolean;
description: string;
}
interface InputNumber {
key: string;
required: boolean;
description: string;
min: number;
max: number;
increment: number;
default: number;
}
interface SelectText {
key: string;
required: boolean;
description: string;
options: string[];
default: string;
}
interface SelectNumber {
key: string;
required: boolean;
description: string;
options: number[];
default: number;
}
interface Brain {
required: boolean;
description: string;
type: string;
}
interface Inputs {
files: InputFile[];
urls: InputUrl[];
texts: InputText[];
booleans?: InputBoolean[];
numbers: InputNumber[];
select_texts?: SelectText[];
select_numbers: SelectNumber[];
brain: Brain;
conditional_inputs?: ConditionalInput[];
}
export interface Assistant {
id: number;
name: string;
description: string;
pricing: Pricing;
tags: string[];
input_description: string;
output_description: string;
inputs: Inputs;
icon_url: string;
}
interface ProcessAssistantInputFile {
key: string;
value: string;
}
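// Shows or hides the input named `conditional_key` based on whether the
// current value of the input named `key` equals (or differs from) `value`.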
export interface ConditionalInput {
key: string;
conditional_key: string;
condition: "equals" | "not_equals";
value: string;
}
export interface ProcessAssistantData {
id: number;
name: string;
inputs: {
files?: ProcessAssistantInputFile[];
urls?: { key: string; value: string }[];
texts?: { key: string; value: string }[];
booleans?: { key: string; value: boolean | null }[];
numbers?: { key: string; value: number }[];
select_texts?: { key: string; value: string | null }[];
select_numbers?: { key: string; value: number }[];
brain?: { value: string };
conditional_inputs?: ConditionalInput[];
};
}
export interface ProcessAssistantInput {
input: ProcessAssistantData;
files: File[];
}
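// Illustrative only: a minimal, hypothetical payload matching the shapes
// above, for an assistant exposing one file input and one boolean.
//
// const example: ProcessAssistantInput = {
//   input: {
//     id: 1,
//     name: "Example assistant",
//     inputs: {
//       files: [{ key: "document", value: "report.pdf" }],
//       booleans: [{ key: "strict_mode", value: true }],
//     },
//   },
//   files: [new File(["..."], "report.pdf")],
// };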
export interface ResultDownload {
data: string;
}

View File

@ -0,0 +1,13 @@
export interface ProcessMetadata {
input_files: string[];
}
export interface Process {
answer: string;
id: number;
name: string;
creation_time: string;
status: "pending" | "processing" | "completed" | "error";
assistant_name: string;
task_metadata: ProcessMetadata;
}

View File

@ -14,6 +14,7 @@
margin-left: -(Spacings.$spacing05 + Spacings.$spacing03);
gap: Spacings.$spacing03;
align-items: center;
padding-top: Spacings.$spacing05;
.tabs {
width: 100%;

View File

@ -4,6 +4,7 @@ import { Checkbox } from "@/lib/components/ui/Checkbox/Checkbox";
import { Icon } from "@/lib/components/ui/Icon/Icon";
import { QuivrButton } from "@/lib/components/ui/QuivrButton/QuivrButton";
import { TextInput } from "@/lib/components/ui/TextInput/TextInput";
import { updateSelectedItems } from "@/lib/helpers/table";
import { useDevice } from "@/lib/hooks/useDevice";
import { isUploadedKnowledge, Knowledge } from "@/lib/types/Knowledge";
@ -58,51 +59,6 @@ const filterAndSortKnowledge = (
return filteredList;
};
const updateSelectedKnowledge = ({
knowledge,
index,
event,
lastSelectedIndex,
filteredKnowledgeList,
selectedKnowledge,
}: {
knowledge: Knowledge;
index: number;
event: React.MouseEvent;
lastSelectedIndex: number | null;
filteredKnowledgeList: Knowledge[];
selectedKnowledge: Knowledge[];
}): { selectedKnowledge: Knowledge[]; lastSelectedIndex: number | null } => {
if (event.shiftKey && lastSelectedIndex !== null) {
const start = Math.min(lastSelectedIndex, index);
const end = Math.max(lastSelectedIndex, index);
const range = filteredKnowledgeList.slice(start, end + 1);
const newSelected = [...selectedKnowledge];
range.forEach((item) => {
if (!newSelected.some((selectedItem) => selectedItem.id === item.id)) {
newSelected.push(item);
}
});
return { selectedKnowledge: newSelected, lastSelectedIndex: index };
} else {
const isSelected = selectedKnowledge.some(
(item) => item.id === knowledge.id
);
const newSelectedKnowledge = isSelected
? selectedKnowledge.filter(
(selectedItem) => selectedItem.id !== knowledge.id
)
: [...selectedKnowledge, knowledge];
return {
selectedKnowledge: newSelectedKnowledge,
lastSelectedIndex: isSelected ? null : index,
};
}
};
const KnowledgeTable = React.forwardRef<HTMLDivElement, KnowledgeTableProps>(
({ knowledgeList }, ref) => {
const [selectedKnowledge, setSelectedKnowledge] = useState<Knowledge[]>([]);
@ -131,15 +87,15 @@ const KnowledgeTable = React.forwardRef<HTMLDivElement, KnowledgeTableProps>(
index: number,
event: React.MouseEvent
) => {
const newSelectedKnowledge = updateSelectedKnowledge({
knowledge,
const newSelectedKnowledge = updateSelectedItems<Knowledge>({
item: knowledge,
index,
event,
lastSelectedIndex,
filteredKnowledgeList,
selectedKnowledge,
filteredList: filteredKnowledgeList,
selectedItems: selectedKnowledge,
});
setSelectedKnowledge(newSelectedKnowledge.selectedKnowledge);
setSelectedKnowledge(newSelectedKnowledge.selectedItems);
setLastSelectedIndex(newSelectedKnowledge.lastSelectedIndex);
};

View File

@ -1,38 +1,63 @@
import { AxiosInstance } from "axios";
import { Assistant, ProcessAssistantRequest } from "./types";
import {
Assistant,
ProcessAssistantInput,
} from "@/app/quality-assistant/types/assistant";
import { Process } from "@/app/quality-assistant/types/process";
export const getAssistants = async (
axiosInstance: AxiosInstance
): Promise<Assistant[] | undefined> => {
return (await axiosInstance.get<Assistant[] | undefined>("/assistants")).data;
): Promise<Assistant[]> => {
return (await axiosInstance.get<Assistant[]>(`/assistants`)).data;
};
export const processAssistant = async (
export const getTasks = async (
axiosInstance: AxiosInstance
): Promise<Process[]> => {
return (await axiosInstance.get<Process[]>(`/assistants/tasks`)).data;
};
export const processTask = async (
axiosInstance: AxiosInstance,
input: ProcessAssistantRequest,
files: File[]
): Promise<string | undefined> => {
processAssistantInput: ProcessAssistantInput
): Promise<string> => {
const formData = new FormData();
formData.append(
"input",
JSON.stringify({
name: input.name,
inputs: {
files: input.inputs.files,
urls: input.inputs.urls,
texts: input.inputs.texts,
},
outputs: input.outputs,
})
);
formData.append("input", JSON.stringify(processAssistantInput.input));
files.forEach((file) => {
formData.append("files", file);
processAssistantInput.files.forEach((file) => {
if (file instanceof File) {
formData.append("files", file);
} else {
console.error("The element is not a valid file", file);
}
});
return (
await axiosInstance.post<string | undefined>("/assistant/process", formData)
).data;
const response = await axiosInstance.post<string>(
`/assistants/task`,
formData,
{
headers: {
"Content-Type": "multipart/form-data",
},
}
);
return response.data;
};
export const deleteTask = async (
axiosInstance: AxiosInstance,
taskId: number
): Promise<void> => {
await axiosInstance.delete(`/assistants/task/${taskId}`);
};
export const downloadTaskResult = async (
axiosInstance: AxiosInstance,
taskId: number
): Promise<string> => {
return (await axiosInstance<string>(`/assistants/task/${taskId}/download`))
.data;
};

View File

@ -1,7 +1,13 @@
import { ProcessAssistantInput } from "@/app/quality-assistant/types/assistant";
import { useAxios } from "@/lib/hooks";
import { getAssistants, processAssistant } from "./assistants";
import { ProcessAssistantRequest } from "./types";
import {
deleteTask,
downloadTaskResult,
getAssistants,
getTasks,
processTask,
} from "./assistants";
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
export const useAssistants = () => {
@ -9,7 +15,11 @@ export const useAssistants = () => {
return {
getAssistants: async () => getAssistants(axiosInstance),
processAssistant: async (input: ProcessAssistantRequest, files: File[]) =>
processAssistant(axiosInstance, input, files),
getTasks: async () => getTasks(axiosInstance),
processTask: async (processAssistantInput: ProcessAssistantInput) =>
processTask(axiosInstance, processAssistantInput),
deleteTask: async (taskId: number) => deleteTask(axiosInstance, taskId),
downloadTaskResult: async (taskId: number) =>
downloadTaskResult(axiosInstance, taskId),
};
};

View File

@ -1,5 +1,6 @@
import { MotionConfig } from "framer-motion";
import { usePathname, useRouter } from "next/navigation";
import { useFeatureFlagEnabled } from "posthog-js/react";
import { useState } from "react";
import { MenuControlButton } from "@/app/chat/[chatId]/components/ActionsBar/components/ChatInput/components/MenuControlButton/MenuControlButton";
@ -17,11 +18,13 @@ import { HomeButton } from "./components/HomeButton/HomeButton";
import { Notifications } from "./components/Notifications/Notifications";
import { NotificationsButton } from "./components/NotificationsButton/NotificationsButton";
import { ProfileButton } from "./components/ProfileButton/ProfileButton";
import { QualityAssistantButton } from "./components/QualityAssistantButton/QualityAssistantButton";
import { SocialsButtons } from "./components/SocialsButtons/SocialsButtons";
import { StudioButton } from "./components/StudioButton/StudioButton";
import { ThreadsButton } from "./components/ThreadsButton/ThreadsButton";
import { UpgradeToPlusButton } from "./components/UpgradeToPlusButton/UpgradeToPlusButton";
const showUpgradeButton = process.env.NEXT_PUBLIC_SHOW_TOKENS === "true";
export const Menu = (): JSX.Element => {
@ -31,6 +34,8 @@ export const Menu = (): JSX.Element => {
const pathname = usePathname() ?? "";
const [isLogoHovered, setIsLogoHovered] = useState<boolean>(false);
const { isDarkMode } = useUserSettingsContext();
const flagEnabled = useFeatureFlagEnabled("show-quality-assistant");
useChatsList();
@ -44,6 +49,7 @@ export const Menu = (): JSX.Element => {
"/library",
"/search",
"studio",
"/quality-assistant",
"/user",
];
@ -59,9 +65,8 @@ export const Menu = (): JSX.Element => {
<div>
<MotionConfig transition={{ mass: 1, damping: 10, duration: 0.1 }}>
<div
className={`${styles.menu_container} ${
!isOpened ? styles.hidden : ""
}`}
className={`${styles.menu_container} ${!isOpened ? styles.hidden : ""
}`}
>
<AnimatedDiv>
<div className={styles.menu_wrapper}>
@ -83,6 +88,8 @@ export const Menu = (): JSX.Element => {
<div className={styles.block}>
<DiscussionButton />
<HomeButton />
{flagEnabled && <QualityAssistantButton />}
<StudioButton />
<NotificationsButton />
<ThreadsButton />

Some files were not shown because too many files have changed in this diff