Mirror of https://github.com/StanGirard/quivr.git (synced 2024-11-22 11:33:57 +03:00)
feat: kms-migration (#3446)

# Description

Necessary changes for Kms v0.1.

Parent: 6415c75cbd
Commit: 1356d87098
@@ -112,9 +112,9 @@ class QuivrFile:
         id: UUID,
         original_filename: str,
         path: Path,
-        brain_id: UUID,
         file_sha1: str,
         file_extension: FileExtension | str,
+        brain_id: UUID | None = None,
         file_size: int | None = None,
         metadata: dict[str, Any] | None = None,
     ) -> None:

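The practical effect of this hunk is that `brain_id` stops being a required positional argument and becomes an optional keyword defaulting to `None`, placed after `file_extension`. A minimal construction sketch under that reading; the import path and the example values are assumptions for illustration, not taken from the diff:

```python
# Sketch only: constructing a QuivrFile without binding it to a brain.
# Import path and example values are assumed; the keyword layout follows the hunk above.
from pathlib import Path
from uuid import uuid4

from quivr_core.files.file import FileExtension, QuivrFile

qfile = QuivrFile(
    id=uuid4(),
    original_filename="report.pdf",
    path=Path("/tmp/report.pdf"),
    file_sha1="0" * 40,                 # placeholder digest, illustrative only
    file_extension=FileExtension.pdf,
    # brain_id now defaults to None and can be attached later
    file_size=1024,
    metadata={"source": "upload"},
)
```
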
@@ -3,6 +3,7 @@ import logging
 import tiktoken
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
+from megaparse import MegaParse

 from quivr_core.config import MegaparseConfig
 from quivr_core.files.file import QuivrFile

@@ -55,14 +56,11 @@ class MegaparseProcessor(ProcessorBase):
         }

     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
-        # mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
-        # document: Document = await mega_parse.aload()
-        # if len(document.page_content) > self.splitter_config.chunk_size:
-        #     docs = self.text_splitter.split_documents([document])
-        #     for doc in docs:
-        #         # if "Production Fonts (maximum)" in doc.page_content:
-        #         #     print('Doc: ', doc.page_content)
-        #         doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
-        #     return docs
-        # return [document]
-        return []
+        mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
+        document: Document = await mega_parse.aload()
+        if len(document.page_content) > self.splitter_config.chunk_size:
+            docs = self.text_splitter.split_documents([document])
+            for doc in docs:
+                doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
+            return docs
+        return [document]

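Because this hunk only re-enables the previously commented Megaparse path, a short usage sketch may help. The default construction of `MegaparseProcessor` and the module path (taken from the `cls_mod` string in a later hunk) are assumptions about the surrounding API:

```python
# Illustrative driver for the re-enabled Megaparse path; assumes MegaparseProcessor
# can be built with its default configuration and that `qfile` is a QuivrFile
# pointing at a parseable document.
import asyncio

from quivr_core.processor.implementations.megaparse_processor import MegaparseProcessor


async def parse(qfile):
    processor = MegaparseProcessor()
    # After this commit the method actually parses the file: it returns split
    # chunks when the parsed text exceeds splitter_config.chunk_size, otherwise
    # a single Document, instead of the previous hard-coded empty list.
    return await processor.process_file_inner(qfile)


# docs = asyncio.run(parse(qfile))
```
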
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import AsyncIterable

 import httpx

@@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):

     def __init__(
         self,
-        tika_url: str = "http://localhost:9998/tika",
+        tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
         splitter: TextSplitter | None = None,
         splitter_config: SplitterConfig = SplitterConfig(),
         timeout: float = 5.0,

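One nuance of this change: `os.getenv` sits in a default argument, so it is evaluated when the module defining `TikaProcessor` is imported, not on each call. A hedged sketch of the override (the module path is an assumption):

```python
# Sketch: overriding the Tika endpoint via the environment. The variable must be
# set before the module defining TikaProcessor is first imported, because the
# os.getenv(...) default is evaluated at class-definition time.
import os

os.environ["TIKA_SERVER_URL"] = "http://tika.internal:9998/tika"

from quivr_core.processor.implementations.tika_processor import TikaProcessor  # assumed path

processor = TikaProcessor()  # tika_url resolves to the env var, else the localhost default
```
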
@@ -2,7 +2,6 @@ import logging
 from abc import ABC, abstractmethod
 from importlib.metadata import PackageNotFoundError, version
 from typing import Any
-from uuid import uuid4

 from langchain_core.documents import Document

@@ -43,7 +42,6 @@ class ProcessorBase(ABC):
                 "utf-8"
             )
             doc.metadata = {
-                "id": uuid4(),
                 "chunk_index": idx,
                 "quivr_core_version": qvr_version,
                 **file.metadata,

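A hypothetical downstream adjustment, not part of this diff: `ProcessorBase` no longer stamps a per-chunk `"id"` UUID into `doc.metadata`, so any consumer that read it must supply its own identifier, for example:

```python
# Hypothetical helper, illustrative only: chunk metadata no longer carries "id",
# so fall back to generating one on the consumer side.
from uuid import uuid4


def chunk_id(doc) -> str:
    return str(doc.metadata.get("id", uuid4()))
```
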
@@ -117,13 +117,13 @@ def defaults_to_proc_entries(

     # TODO(@aminediro): Megaparse should register itself
     # Append Megaparse
-    _append_proc_mapping(
-        mapping=base_processors,
-        file_ext=FileExtension.pdf,
-        cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
-        errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
-        priority=None,
-    )
+    # _append_proc_mapping(
+    #     mapping=base_processors,
+    #     file_ext=FileExtension.pdf,
+    #     cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
+    #     errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
+    #     priority=None,
+    # )
     return base_processors

@@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):


 class KnowledgeStatus(str, Enum):
-    PROCESSING = "PROCESSING"
-    UPLOADED = "UPLOADED"
     ERROR = "ERROR"
     RESERVED = "RESERVED"
+    PROCESSING = "PROCESSING"
+    PROCESSED = "PROCESSED"
+    UPLOADED = "UPLOADED"


 class Source(BaseModel):

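The enum keeps its `str` base and gains a `PROCESSED` member, so members still compare equal to their raw string values. A standalone sketch of the resulting enum (member names are exactly those in the hunk; in the real code it lives next to `ChatMessage`, per the hunk header):

```python
# Standalone copy of the updated enum for illustration only.
from enum import Enum


class KnowledgeStatus(str, Enum):
    ERROR = "ERROR"
    RESERVED = "RESERVED"
    PROCESSING = "PROCESSING"
    PROCESSED = "PROCESSED"
    UPLOADED = "UPLOADED"


assert KnowledgeStatus("PROCESSED") is KnowledgeStatus.PROCESSED
assert KnowledgeStatus.PROCESSED == "PROCESSED"  # str mixin keeps string equality
```
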
@@ -1,20 +1,20 @@
+import asyncio
 import logging
 from typing import (
     Annotated,
+    Any,
     AsyncGenerator,
+    Dict,
     List,
     Optional,
     Sequence,
     Tuple,
-    TypedDict,
-    Dict,
-    Any,
     Type,
+    TypedDict,
 )
 from uuid import uuid4
-import asyncio

-# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
+import openai
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain_cohere import CohereRerank
 from langchain_community.document_compressors import JinaRerank

@@ -22,20 +22,17 @@ from langchain_core.callbacks import Callbacks
 from langchain_core.documents import BaseDocumentCompressor, Document
 from langchain_core.messages import BaseMessage
 from langchain_core.messages.ai import AIMessageChunk
-from langchain_core.vectorstores import VectorStore
 from langchain_core.prompts.base import BasePromptTemplate
-from langgraph.graph import START, END, StateGraph
+from langchain_core.vectorstores import VectorStore
+from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
 from langgraph.types import Send
-
-
 from pydantic import BaseModel, Field
-import openai

-from quivr_core.rag.entities.chat import ChatHistory
-from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.llm import LLMEndpoint
 from quivr_core.llm_tools.llm_tools import LLMToolFactory
+from quivr_core.rag.entities.chat import ChatHistory
+from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.rag.entities.models import (
     ParsedRAGChunkResponse,
     QuivrKnowledge,