feat: kms-migration (#3446)

# Description

- Necessary changes for KMS v0.1
Authored by AmineDiro on 2024-11-01 16:16:37 +01:00, committed by GitHub
parent 6415c75cbd
commit 1356d87098
GPG Key ID: B5690EEEBB952194
7 changed files with 31 additions and 36 deletions


@@ -112,9 +112,9 @@ class QuivrFile:
         id: UUID,
         original_filename: str,
         path: Path,
-        brain_id: UUID,
         file_sha1: str,
         file_extension: FileExtension | str,
+        brain_id: UUID | None = None,
         file_size: int | None = None,
         metadata: dict[str, Any] | None = None,
     ) -> None:
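The constructor change makes `brain_id` an optional keyword argument with a `None` default, so a `QuivrFile` can now be created before it is attached to any brain. A minimal sketch of the new call site; the `quivr_core.files.file` import path is taken from the MegaparseProcessor hunk below, while the location of `FileExtension` in that same module is an assumption:

```python
from pathlib import Path
from uuid import uuid4

from quivr_core.files.file import FileExtension, QuivrFile  # FileExtension location is an assumption

qfile = QuivrFile(
    id=uuid4(),
    original_filename="report.pdf",
    path=Path("/tmp/report.pdf"),
    file_sha1="0" * 40,                 # illustrative checksum
    file_extension=FileExtension.pdf,
    # brain_id is omitted: it now defaults to None instead of being required
)
```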


@@ -3,6 +3,7 @@ import logging
 import tiktoken
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
+from megaparse import MegaParse
 
 from quivr_core.config import MegaparseConfig
 from quivr_core.files.file import QuivrFile
@@ -55,14 +56,11 @@ class MegaparseProcessor(ProcessorBase):
         }
 
     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
-        # mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
-        # document: Document = await mega_parse.aload()
-        # if len(document.page_content) > self.splitter_config.chunk_size:
-        #     docs = self.text_splitter.split_documents([document])
-        #     for doc in docs:
-        #         # if "Production Fonts (maximum)" in doc.page_content:
-        #         #     print('Doc: ', doc.page_content)
-        #         doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
-        #     return docs
-        # return [document]
-        return []
+        mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
+        document: Document = await mega_parse.aload()
+        if len(document.page_content) > self.splitter_config.chunk_size:
+            docs = self.text_splitter.split_documents([document])
+            for doc in docs:
+                doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
+            return docs
+        return [document]
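This restores the MegaParse-backed body of `process_file_inner` that had been commented out: the file is parsed with `MegaParse.aload()`, and the resulting document is split into chunks only when its content exceeds `splitter_config.chunk_size`, each chunk tagged with its token count. A rough usage sketch, reusing the `QuivrFile` built in the previous sketch; the module path comes from the registry hunk further down, and relying on the processor's default configuration is an assumption:

```python
from quivr_core.processor.implementations.megaparse_processor import MegaparseProcessor


async def parse_with_megaparse(qfile):
    """qfile: a QuivrFile pointing at a PDF, built as in the previous sketch."""
    processor = MegaparseProcessor()  # assumption: the default MegaparseConfig/splitter suffice
    docs = await processor.process_file_inner(qfile)
    # Short files come back as a single Document; longer ones are split into
    # chunks, each carrying a "chunk_size" token count in its metadata.
    return docs
```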


@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import AsyncIterable
 
 import httpx
@@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):
     def __init__(
         self,
-        tika_url: str = "http://localhost:9998/tika",
+        tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
         splitter: TextSplitter | None = None,
         splitter_config: SplitterConfig = SplitterConfig(),
         timeout: float = 5.0,
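The default Tika endpoint is now resolved from the `TIKA_SERVER_URL` environment variable, falling back to `http://localhost:9998/tika`. Because the lookup sits in a parameter default, it is evaluated when the module is imported, so the variable has to be set before that import happens. A small sketch; the `tika_processor` module path is an assumption, since only the class body appears in the hunk:

```python
import os

# Must be set before the module defining TikaProcessor is imported,
# since os.getenv() in the parameter default runs at import time.
os.environ.setdefault("TIKA_SERVER_URL", "http://tika.internal:9998/tika")

from quivr_core.processor.implementations.tika_processor import TikaProcessor  # path is an assumption

remote = TikaProcessor()                                      # picks up TIKA_SERVER_URL
local = TikaProcessor(tika_url="http://localhost:9998/tika")  # an explicit URL still overrides it
```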


@@ -2,7 +2,6 @@ import logging
 from abc import ABC, abstractmethod
 from importlib.metadata import PackageNotFoundError, version
 from typing import Any
-from uuid import uuid4
 
 from langchain_core.documents import Document
@@ -43,7 +42,6 @@ class ProcessorBase(ABC):
                 "utf-8"
             )
             doc.metadata = {
-                "id": uuid4(),
                 "chunk_index": idx,
                 "quivr_core_version": qvr_version,
                 **file.metadata,
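The base processor no longer stamps every chunk with a random `uuid4()` id; chunk metadata is now just the chunk index, the quivr-core package version, and the merged file-level metadata. A standalone illustration of the resulting shape using plain langchain `Document`s (all values are made up):

```python
from langchain_core.documents import Document

qvr_version = "0.0.0"                                # illustrative version string
file_metadata = {"original_filename": "report.pdf"}  # illustrative file-level metadata

docs = [Document(page_content="first chunk"), Document(page_content="second chunk")]
for idx, doc in enumerate(docs):
    # Mirrors what ProcessorBase now attaches: no per-chunk "id" any more.
    doc.metadata = {
        "chunk_index": idx,
        "quivr_core_version": qvr_version,
        **file_metadata,
    }
```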


@@ -117,13 +117,13 @@ def defaults_to_proc_entries(
     # TODO(@aminediro): Megaparse should register itself
     # Append Megaparse
-    _append_proc_mapping(
-        mapping=base_processors,
-        file_ext=FileExtension.pdf,
-        cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
-        errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
-        priority=None,
-    )
+    # _append_proc_mapping(
+    #     mapping=base_processors,
+    #     file_ext=FileExtension.pdf,
+    #     cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
+    #     errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
+    #     priority=None,
+    # )
 
     return base_processors


@@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):
 class KnowledgeStatus(str, Enum):
-    PROCESSING = "PROCESSING"
-    UPLOADED = "UPLOADED"
     ERROR = "ERROR"
     RESERVED = "RESERVED"
+    PROCESSING = "PROCESSING"
+    PROCESSED = "PROCESSED"
+    UPLOADED = "UPLOADED"
 
 class Source(BaseModel):
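The status enum gains a `PROCESSED` member and the existing members are reordered. Transcribed from the new side of the hunk, the enum now reads:

```python
from enum import Enum


class KnowledgeStatus(str, Enum):
    ERROR = "ERROR"
    RESERVED = "RESERVED"
    PROCESSING = "PROCESSING"
    PROCESSED = "PROCESSED"
    UPLOADED = "UPLOADED"
```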


@@ -1,20 +1,20 @@
+import asyncio
 import logging
 from typing import (
     Annotated,
+    Any,
     AsyncGenerator,
+    Dict,
     List,
     Optional,
     Sequence,
     Tuple,
-    TypedDict,
-    Dict,
-    Any,
     Type,
+    TypedDict,
 )
 from uuid import uuid4
-import asyncio
 
-# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
+import openai
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain_cohere import CohereRerank
 from langchain_community.document_compressors import JinaRerank
@@ -22,20 +22,17 @@ from langchain_core.callbacks import Callbacks
 from langchain_core.documents import BaseDocumentCompressor, Document
 from langchain_core.messages import BaseMessage
 from langchain_core.messages.ai import AIMessageChunk
-from langchain_core.vectorstores import VectorStore
 from langchain_core.prompts.base import BasePromptTemplate
-from langgraph.graph import START, END, StateGraph
+from langchain_core.vectorstores import VectorStore
+from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
 from langgraph.types import Send
 from pydantic import BaseModel, Field
-import openai
 
-from quivr_core.rag.entities.chat import ChatHistory
-from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.llm import LLMEndpoint
 from quivr_core.llm_tools.llm_tools import LLMToolFactory
+from quivr_core.rag.entities.chat import ChatHistory
+from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.rag.entities.models import (
     ParsedRAGChunkResponse,
     QuivrKnowledge,