Mirror of https://github.com/StanGirard/quivr.git (synced 2024-11-26 03:15:19 +03:00)
feat: kms-migration (#3446)
# Description
- Necessary changes for Kms v0.1
This commit is contained in:
parent 6415c75cbd
commit 1356d87098
@ -112,9 +112,9 @@ class QuivrFile:
|
||||
id: UUID,
|
||||
original_filename: str,
|
||||
path: Path,
|
||||
brain_id: UUID,
|
||||
file_sha1: str,
|
||||
file_extension: FileExtension | str,
|
||||
brain_id: UUID | None = None,
|
||||
file_size: int | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
) -> None:
|
||||
|
@ -3,6 +3,7 @@ import logging
|
||||
import tiktoken
|
||||
from langchain_core.documents import Document
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
||||
from megaparse import MegaParse
|
||||
|
||||
from quivr_core.config import MegaparseConfig
|
||||
from quivr_core.files.file import QuivrFile
|
||||
@ -55,14 +56,11 @@ class MegaparseProcessor(ProcessorBase):
|
||||
}
|
||||
|
||||
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
# mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
|
||||
# document: Document = await mega_parse.aload()
|
||||
# if len(document.page_content) > self.splitter_config.chunk_size:
|
||||
# docs = self.text_splitter.split_documents([document])
|
||||
# for doc in docs:
|
||||
# # if "Production Fonts (maximum)" in doc.page_content:
|
||||
# # print('Doc: ', doc.page_content)
|
||||
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
# return docs
|
||||
# return [document]
|
||||
return []
|
||||
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
|
||||
document: Document = await mega_parse.aload()
|
||||
if len(document.page_content) > self.splitter_config.chunk_size:
|
||||
docs = self.text_splitter.split_documents([document])
|
||||
for doc in docs:
|
||||
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
return docs
|
||||
return [document]
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncIterable
|
||||
|
||||
import httpx
|
||||
@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tika_url: str = "http://localhost:9998/tika",
|
||||
tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
|
||||
splitter: TextSplitter | None = None,
|
||||
splitter_config: SplitterConfig = SplitterConfig(),
|
||||
timeout: float = 5.0,
|
||||
|
@ -2,7 +2,6 @@ import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from importlib.metadata import PackageNotFoundError, version
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -43,7 +42,6 @@ class ProcessorBase(ABC):
|
||||
"utf-8"
|
||||
)
|
||||
doc.metadata = {
|
||||
"id": uuid4(),
|
||||
"chunk_index": idx,
|
||||
"quivr_core_version": qvr_version,
|
||||
**file.metadata,
|
||||
|
@ -117,13 +117,13 @@ def defaults_to_proc_entries(
|
||||
|
||||
# TODO(@aminediro): Megaparse should register itself
|
||||
# Append Megaparse
|
||||
_append_proc_mapping(
|
||||
mapping=base_processors,
|
||||
file_ext=FileExtension.pdf,
|
||||
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
|
||||
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
|
||||
priority=None,
|
||||
)
|
||||
# _append_proc_mapping(
|
||||
# mapping=base_processors,
|
||||
# file_ext=FileExtension.pdf,
|
||||
# cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
|
||||
# errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
|
||||
# priority=None,
|
||||
# )
|
||||
return base_processors
|
||||
|
||||
|
||||
|
@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):
|
||||
|
||||
|
||||
class KnowledgeStatus(str, Enum):
|
||||
PROCESSING = "PROCESSING"
|
||||
UPLOADED = "UPLOADED"
|
||||
ERROR = "ERROR"
|
||||
RESERVED = "RESERVED"
|
||||
PROCESSING = "PROCESSING"
|
||||
PROCESSED = "PROCESSED"
|
||||
UPLOADED = "UPLOADED"
|
||||
|
||||
|
||||
class Source(BaseModel):
|
||||
|
@ -1,20 +1,20 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import (
|
||||
Annotated,
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
Dict,
|
||||
Any,
|
||||
Type,
|
||||
TypedDict,
|
||||
)
|
||||
from uuid import uuid4
|
||||
import asyncio
|
||||
|
||||
# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
|
||||
import openai
|
||||
from langchain.retrievers import ContextualCompressionRetriever
|
||||
from langchain_cohere import CohereRerank
|
||||
from langchain_community.document_compressors import JinaRerank
|
||||
@ -22,20 +22,17 @@ from langchain_core.callbacks import Callbacks
|
||||
from langchain_core.documents import BaseDocumentCompressor, Document
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langchain_core.messages.ai import AIMessageChunk
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from langchain_core.prompts.base import BasePromptTemplate
|
||||
from langgraph.graph import START, END, StateGraph
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from langgraph.graph.message import add_messages
|
||||
from langgraph.types import Send
|
||||
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
import openai
|
||||
|
||||
from quivr_core.rag.entities.chat import ChatHistory
|
||||
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
|
||||
from quivr_core.llm import LLMEndpoint
|
||||
from quivr_core.llm_tools.llm_tools import LLMToolFactory
|
||||
from quivr_core.rag.entities.chat import ChatHistory
|
||||
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
|
||||
from quivr_core.rag.entities.models import (
|
||||
ParsedRAGChunkResponse,
|
||||
QuivrKnowledge,
|
||||
|
Loading…
Reference in New Issue
Block a user