feat: kms-migration (#3446)

# Description

- Necessary changes for KMS v0.1
This commit is contained in:
AmineDiro 2024-11-01 16:16:37 +01:00 committed by GitHub
parent 6415c75cbd
commit 1356d87098
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 31 additions and 36 deletions

View File

@ -112,9 +112,9 @@ class QuivrFile:
id: UUID,
original_filename: str,
path: Path,
brain_id: UUID,
file_sha1: str,
file_extension: FileExtension | str,
brain_id: UUID | None = None,
file_size: int | None = None,
metadata: dict[str, Any] | None = None,
) -> None:

View File

@ -3,6 +3,7 @@ import logging
import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse import MegaParse
from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile
@ -55,14 +56,11 @@ class MegaparseProcessor(ProcessorBase):
}
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
    """Parse ``file`` with MegaParse and return it as one or more Documents.

    The file is loaded through MegaParse; if the resulting page content
    exceeds the configured chunk size, it is split with the configured
    text splitter and each chunk's metadata records its token count.

    Args:
        file: The file to parse (only ``file.path`` is read here).

    Returns:
        A list of chunk Documents when splitting was needed, otherwise a
        single-element list with the whole parsed document.
    """
    # NOTE(review): the previous revision left a commented-out copy of this
    # implementation above a stub `return []`, which made the live code below
    # unreachable. The dead code and the stub return are removed.
    mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
    document: Document = await mega_parse.aload()
    if len(document.page_content) > self.splitter_config.chunk_size:
        docs = self.text_splitter.split_documents([document])
        for doc in docs:
            # Token count (not character count) of the chunk, per the
            # tiktoken encoder configured on this processor.
            doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
        return docs
    return [document]

View File

@ -1,4 +1,5 @@
import logging
import os
from typing import AsyncIterable
import httpx
@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):
def __init__(
self,
tika_url: str = "http://localhost:9998/tika",
tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
splitter: TextSplitter | None = None,
splitter_config: SplitterConfig = SplitterConfig(),
timeout: float = 5.0,

View File

@ -2,7 +2,6 @@ import logging
from abc import ABC, abstractmethod
from importlib.metadata import PackageNotFoundError, version
from typing import Any
from uuid import uuid4
from langchain_core.documents import Document
@ -43,7 +42,6 @@ class ProcessorBase(ABC):
"utf-8"
)
doc.metadata = {
"id": uuid4(),
"chunk_index": idx,
"quivr_core_version": qvr_version,
**file.metadata,

View File

@ -117,13 +117,13 @@ def defaults_to_proc_entries(
# TODO(@aminediro): Megaparse should register itself
# Append Megaparse
_append_proc_mapping(
mapping=base_processors,
file_ext=FileExtension.pdf,
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
priority=None,
)
# _append_proc_mapping(
# mapping=base_processors,
# file_ext=FileExtension.pdf,
# cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
# errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
# priority=None,
# )
return base_processors

View File

@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):
class KnowledgeStatus(str, Enum):
    """Lifecycle states of a knowledge entry.

    Inherits from ``str`` so members compare equal to their string value
    (e.g. ``KnowledgeStatus.ERROR == "ERROR"``), which keeps JSON
    serialization and API payloads transparent.
    """

    # NOTE(review): the previous revision listed PROCESSING and UPLOADED
    # twice, which raises ``TypeError: Attempted to reuse key`` at class
    # creation time for an Enum. Each member now appears exactly once.
    PROCESSING = "PROCESSING"
    UPLOADED = "UPLOADED"
    ERROR = "ERROR"
    RESERVED = "RESERVED"
    PROCESSED = "PROCESSED"
class Source(BaseModel):

View File

@ -1,20 +1,20 @@
import asyncio
import logging
from typing import (
Annotated,
Any,
AsyncGenerator,
Dict,
List,
Optional,
Sequence,
Tuple,
TypedDict,
Dict,
Any,
Type,
TypedDict,
)
from uuid import uuid4
import asyncio
# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
import openai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.document_compressors import JinaRerank
@ -22,20 +22,17 @@ from langchain_core.callbacks import Callbacks
from langchain_core.documents import BaseDocumentCompressor, Document
from langchain_core.messages import BaseMessage
from langchain_core.messages.ai import AIMessageChunk
from langchain_core.vectorstores import VectorStore
from langchain_core.prompts.base import BasePromptTemplate
from langgraph.graph import START, END, StateGraph
from langchain_core.vectorstores import VectorStore
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.types import Send
from pydantic import BaseModel, Field
import openai
from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
from quivr_core.llm import LLMEndpoint
from quivr_core.llm_tools.llm_tools import LLMToolFactory
from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
from quivr_core.rag.entities.models import (
ParsedRAGChunkResponse,
QuivrKnowledge,