From bb572a2a8d060f147461506aadd38704eb029a9a Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Fri, 11 Oct 2024 16:59:03 +0200 Subject: [PATCH] docs(core): init (#3365) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --------- Co-authored-by: aminediro Co-authored-by: Jacopo Chevallard Co-authored-by: chloedia Co-authored-by: AmineDiro --- backend/core/quivr_core/base_config.py | 24 ++ backend/core/quivr_core/brain/brain.py | 183 ++++++++++++++ backend/core/quivr_core/chat.py | 44 +++- backend/core/quivr_core/config.py | 236 ++++++++++++++++-- backend/core/quivr_core/storage/__init__.py | 0 .../core/quivr_core/storage/local_storage.py | 64 +++++ .../core/quivr_core/storage/storage_base.py | 50 ++++ backend/docs/docs/brain/brain.md | 3 + backend/docs/docs/brain/chat.md | 11 + backend/docs/docs/brain/index.md | 42 ++++ backend/docs/docs/config/base_config.md | 5 + backend/docs/docs/config/config.md | 22 ++ backend/docs/docs/config/index.md | 0 backend/docs/docs/examples/custom_storage.md | 3 + backend/docs/docs/parsers/index.md | 2 +- backend/docs/docs/storage/base.md | 5 + backend/docs/docs/storage/index.md | 28 +++ backend/docs/docs/storage/local_storage.md | 5 + backend/docs/mkdocs.yml | 59 +++-- 19 files changed, 743 insertions(+), 43 deletions(-) create mode 100644 backend/core/quivr_core/storage/__init__.py create mode 100644 backend/docs/docs/brain/brain.md create mode 100644 backend/docs/docs/brain/chat.md create mode 
100644 backend/docs/docs/brain/index.md create mode 100644 backend/docs/docs/config/base_config.md create mode 100644 backend/docs/docs/config/config.md create mode 100644 backend/docs/docs/config/index.md create mode 100644 backend/docs/docs/examples/custom_storage.md create mode 100644 backend/docs/docs/storage/base.md create mode 100644 backend/docs/docs/storage/index.md create mode 100644 backend/docs/docs/storage/local_storage.md diff --git a/backend/core/quivr_core/base_config.py b/backend/core/quivr_core/base_config.py index 0a2d11546..7d7bc8163 100644 --- a/backend/core/quivr_core/base_config.py +++ b/backend/core/quivr_core/base_config.py @@ -5,10 +5,34 @@ from pydantic import BaseModel, ConfigDict class QuivrBaseConfig(BaseModel): + """ + Base configuration class for Quivr. + + This class extends Pydantic's BaseModel and provides a foundation for + configuration management in quivr-core. + + Attributes: + model_config (ConfigDict): Configuration for the Pydantic model. + It's set to forbid extra attributes, ensuring strict adherence + to the defined schema. + + Class Methods: + from_yaml: Create an instance of the class from a YAML file. + """ + model_config = ConfigDict(extra="forbid") @classmethod def from_yaml(cls, file_path: str | Path): + """ + Create an instance of the class from a YAML file. + + Args: + file_path (str | Path): The path to the YAML file. + + Returns: + QuivrBaseConfig: An instance of the class initialized with the data from the YAML file. 
+ """ # Load the YAML file with open(file_path, "r") as stream: config_data = yaml.safe_load(stream) diff --git a/backend/core/quivr_core/brain/brain.py b/backend/core/quivr_core/brain/brain.py index 3bbe7af02..50f20123f 100644 --- a/backend/core/quivr_core/brain/brain.py +++ b/backend/core/quivr_core/brain/brain.py @@ -46,6 +46,24 @@ logger = logging.getLogger("quivr_core") async def process_files( storage: StorageBase, skip_file_error: bool, **processor_kwargs: dict[str, Any] ) -> list[Document]: + """ + Process files in storage. + This function takes a StorageBase and return a list of langchain documents. + + Args: + storage (StorageBase): The storage containing the files to process. + skip_file_error (bool): Whether to skip files that cannot be processed. + processor_kwargs (dict[str, Any]): Additional arguments for the processor. + + Returns: + list[Document]: List of processed documents in the Langchain Document format. + + Raises: + ValueError: If a file cannot be processed and skip_file_error is False. + Exception: If no processor is found for a file of a specific type and skip_file_error is False. + + """ + knowledge = [] for file in await storage.get_files(): try: @@ -71,6 +89,36 @@ async def process_files( class Brain: + """ + A class representing a Brain. + + This class allows for the creation of a Brain, which is a collection of knowledge one wants to retrieve information from. + + A Brain is set to: + + * Store files in the storage of your choice (local, S3, etc.) + * Process the files in the storage to extract text and metadata in a wide range of format. + * Store the processed files in the vector store of your choice (FAISS, PGVector, etc.) - default to FAISS. + * Create an index of the processed files. + * Use the *Quivr* workflow for the retrieval augmented generation. + + A Brain is able to: + + * Search for information in the vector store. + * Answer questions about the knowledges in the Brain. + * Stream the answer to the question. 
+ + Attributes: + name (str): The name of the brain. + id (UUID): The unique identifier of the brain. + storage (StorageBase): The storage used to store the files. + llm (LLMEndpoint): The language model used to generate the answer. + vector_db (VectorStore): The vector store used to store the processed files. + embedder (Embeddings): The embeddings used to create the index of the processed files. + + + """ + def __init__( self, *, @@ -106,6 +154,22 @@ class Brain: @classmethod def load(cls, folder_path: str | Path) -> Self: + """ + Load a brain from a folder path. + + Args: + folder_path (str | Path): The path to the folder containing the brain. + + Returns: + Brain: The brain loaded from the folder path. + + Example: + ```python + brain_loaded = Brain.load("path/to/brain") + brain_loaded.print_info() + ``` + + """ if isinstance(folder_path, str): folder_path = Path(folder_path) if not folder_path.exists(): @@ -154,6 +218,20 @@ class Brain: ) async def save(self, folder_path: str | Path): + """ + Save the brain to a folder path. + + Args: + folder_path (str | Path): The path to the folder where the brain will be saved. + + Returns: + str: The path to the folder where the brain was saved. + + Example: + ```python + await brain.save("path/to/brain") + ``` + """ if isinstance(folder_path, str): folder_path = Path(folder_path) @@ -247,6 +325,28 @@ class Brain: skip_file_error: bool = False, processor_kwargs: dict[str, Any] | None = None, ): + """ + Create a brain from a list of file paths. + + Args: + name (str): The name of the brain. + file_paths (list[str | Path]): The list of file paths to add to the brain. + vector_db (VectorStore | None): The vector store used to store the processed files. + storage (StorageBase): The storage used to store the files. + llm (LLMEndpoint | None): The language model used to generate the answer. + embedder (Embeddings | None): The embeddings used to create the index of the processed files. 
+ skip_file_error (bool): Whether to skip files that cannot be processed. + processor_kwargs (dict[str, Any] | None): Additional arguments for the processor. + + Returns: + Brain: The brain created from the file paths. + + Example: + ```python + brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"]) + brain.print_info() + ``` + """ if llm is None: llm = default_llm() @@ -327,6 +427,28 @@ class Brain: llm: LLMEndpoint | None = None, embedder: Embeddings | None = None, ) -> Self: + """ + Create a brain from a list of langchain documents. + + Args: + name (str): The name of the brain. + langchain_documents (list[Document]): The list of langchain documents to add to the brain. + vector_db (VectorStore | None): The vector store used to store the processed files. + storage (StorageBase): The storage used to store the files. + llm (LLMEndpoint | None): The language model used to generate the answer. + embedder (Embeddings | None): The embeddings used to create the index of the processed files. + + Returns: + Brain: The brain created from the langchain documents. + + Example: + ```python + from langchain_core.documents import Document + documents = [Document(page_content="Hello, world!")] + brain = await Brain.afrom_langchain_documents(name="My Brain", langchain_documents=documents) + brain.print_info() + ``` + """ if llm is None: llm = default_llm() @@ -357,6 +479,26 @@ class Brain: filter: Callable | Dict[str, Any] | None = None, fetch_n_neighbors: int = 20, ) -> list[SearchResult]: + """ + Search for relevant documents in the brain based on a query. + + Args: + query (str | Document): The query to search for. + n_results (int): The number of results to return. + filter (Callable | Dict[str, Any] | None): The filter to apply to the search. + fetch_n_neighbors (int): The number of neighbors to fetch. + + Returns: + list[SearchResult]: The list of retrieved chunks. 
+ + Example: + ```python + brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"]) + results = await brain.asearch("Why everybody loves Quivr?") + for result in results: + print(result.chunk.page_content) + ``` + """ if not self.vector_db: raise ValueError("No vector db configured for this brain") @@ -383,6 +525,26 @@ class Brain: list_files: list[QuivrKnowledge] | None = None, chat_history: ChatHistory | None = None, ) -> ParsedRAGResponse: + """ + Ask a question to the brain and get a generated answer. + + Args: + question (str): The question to ask. + retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs). + rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use. + list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline. + chat_history (ChatHistory | None): The chat history to use. + + Returns: + ParsedRAGResponse: The generated answer. + + Example: + ```python + brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"]) + answer = brain.ask("What is the meaning of life?") + print(answer.answer) + ``` + """ llm = self.llm # If you passed a different llm model we'll override the brain one @@ -420,6 +582,27 @@ class Brain: list_files: list[QuivrKnowledge] | None = None, chat_history: ChatHistory | None = None, ) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: + """ + Ask a question to the brain and get a streamed generated answer. + + Args: + question (str): The question to ask. + retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs). + rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use. + list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline. + chat_history (ChatHistory | None): The chat history to use. 
+ + Returns: + AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: The streamed generated answer. + + Example: + ```python + brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"]) + async for chunk in brain.ask_streaming("What is the meaning of life?"): + print(chunk.answer) + ``` + + """ llm = self.llm # If you passed a different llm model we'll override the brain one diff --git a/backend/core/quivr_core/chat.py b/backend/core/quivr_core/chat.py index 458c7fafb..15da1c053 100644 --- a/backend/core/quivr_core/chat.py +++ b/backend/core/quivr_core/chat.py @@ -10,21 +10,35 @@ from quivr_core.models import ChatMessage class ChatHistory: """ - Chat history is a list of ChatMessage. - It is used to store the chat history of a chat. + ChatHistory is a class that maintains a record of chat conversations. Each message + in the history is represented by an instance of the `ChatMessage` class, and the + chat history is stored internally as a list of these `ChatMessage` objects. + The class provides methods to retrieve, append, iterate, and manipulate the chat + history, as well as utilities to convert the messages into specific formats + and support deep copying. """ def __init__(self, chat_id: UUID, brain_id: UUID | None) -> None: + """Init a new ChatHistory object. + + Args: + chat_id (UUID): A unique identifier for the chat session. + brain_id (UUID | None): An optional identifier for the brain associated with the chat. + """ self.id = chat_id self.brain_id = brain_id # TODO(@aminediro): maybe use a deque() instead ? self._msgs: list[ChatMessage] = [] def get_chat_history(self, newest_first: bool = False): - """Returns a ChatMessage list sorted by time + """ + Retrieves the chat history, optionally sorted in reverse chronological order. + + Args: + newest_first (bool, optional): If True, returns the messages in reverse order (newest first). Defaults to False. 
Returns: - list[ChatMessage]: list of chat messages + List[ChatMessage]: A sorted list of chat messages. """ history = sorted(self._msgs, key=lambda msg: msg.message_time) if newest_first: @@ -38,7 +52,11 @@ class ChatHistory: self, langchain_msg: AIMessage | HumanMessage, metadata: dict[str, Any] = {} ): """ - Append a message to the chat history. + Appends a new message to the chat history. + + Args: + langchain_msg (AIMessage | HumanMessage): The message content (either an AI or Human message). + metadata (dict[str, Any], optional): Additional metadata related to the message. Defaults to an empty dictionary. """ chat_msg = ChatMessage( chat_id=self.id, @@ -52,7 +70,13 @@ class ChatHistory: def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]: """ - Iterate over the chat history as pairs of HumanMessage and AIMessage. + Iterates over the chat history in pairs, returning a HumanMessage followed by an AIMessage. + + Yields: + Tuple[HumanMessage, AIMessage]: Pairs of human and AI messages. + + Raises: + AssertionError: If the messages in the pair are not in the expected order (i.e., a HumanMessage followed by an AIMessage). """ # Reverse the chat_history, newest first it = iter(self.get_chat_history(newest_first=True)) @@ -66,7 +90,13 @@ class ChatHistory: yield (human_message.msg, ai_message.msg) def to_list(self) -> List[HumanMessage | AIMessage]: - """Format the chat history into a list of HumanMessage and AIMessage""" + """ + Converts the chat history into a list of raw HumanMessage or AIMessage objects. + + Returns: + list[HumanMessage | AIMessage]: A list of messages in their raw form, without metadata. 
+ """ + return [_msg.msg for _msg in self._msgs] def __deepcopy__(self, memo): diff --git a/backend/core/quivr_core/config.py b/backend/core/quivr_core/config.py index 4c5f9d851..c1abcc471 100644 --- a/backend/core/quivr_core/config.py +++ b/backend/core/quivr_core/config.py @@ -21,11 +21,38 @@ class BrainConfig(QuivrBaseConfig): class DefaultRerankers(str, Enum): + """ + Enum representing the default API-based reranker suppliers supported by the application. + + This enum defines the various reranker providers that can be used in the system. + Each enum value corresponds to a specific supplier's identifier and has an + associated default model. + + Attributes: + COHERE (str): Represents Cohere AI as a reranker supplier. + JINA (str): Represents Jina AI as a reranker supplier. + + Methods: + default_model (property): Returns the default model for the selected supplier. + """ + COHERE = "cohere" JINA = "jina" @property def default_model(self) -> str: + """ + Get the default model for the selected reranker supplier. + + This property method returns the default model associated with the current + reranker supplier (COHERE or JINA). + + Returns: + str: The name of the default model for the selected supplier. + + Raises: + KeyError: If the current enum value doesn't have a corresponding default model. + """ # Mapping of suppliers to their default models return { self.COHERE: "rerank-multilingual-v3.0", @@ -34,6 +61,22 @@ class DefaultRerankers(str, Enum): class DefaultModelSuppliers(str, Enum): + """ + Enum representing the default model suppliers supported by the application. + + This enum defines the various AI model providers that can be used as sources + for LLMs in the system. Each enum value corresponds to a specific + supplier's identifier. + + Attributes: + OPENAI (str): Represents OpenAI as a model supplier. + AZURE (str): Represents Azure (Microsoft) as a model supplier. + ANTHROPIC (str): Represents Anthropic as a model supplier. 
+ META (str): Represents Meta as a model supplier. + MISTRAL (str): Represents Mistral AI as a model supplier. + GROQ (str): Represents Groq as a model supplier. + """ + OPENAI = "openai" AZURE = "azure" ANTHROPIC = "anthropic" @@ -159,6 +202,27 @@ class LLMModelConfig: class LLMEndpointConfig(QuivrBaseConfig): + """ + Configuration class for Large Language Models (LLM) endpoints. + + This class defines the settings and parameters for interacting with various LLM providers. + It includes configuration for the model, API keys, token limits, and other relevant settings. + + Attributes: + supplier (DefaultModelSuppliers): The LLM provider (default: OPENAI). + model (str): The specific model to use (default: "gpt-3.5-turbo-0125"). + context_length (int | None): The maximum context length for the model. + tokenizer_hub (str | None): The tokenizer to use for this model. + llm_base_url (str | None): Base URL for the LLM API. + env_variable_name (str): Name of the environment variable for the API key. + llm_api_key (str | None): The API key for the LLM provider. + max_input_tokens (int): Maximum number of input tokens sent to the LLM (default: 2000). + max_output_tokens (int): Maximum number of output tokens returned by the LLM (default: 2000). + temperature (float): Temperature setting for text generation (default: 0.7). + streaming (bool): Whether to use streaming for responses (default: True). + prompt (CustomPromptsModel | None): Custom prompt configuration. + """ + supplier: DefaultModelSuppliers = DefaultModelSuppliers.OPENAI model: str = "gpt-3.5-turbo-0125" context_length: int | None = None @@ -176,15 +240,41 @@ class LLMEndpointConfig(QuivrBaseConfig): @property def fallback_tokenizer(self) -> str: + """ + Get the fallback tokenizer. + + Returns: + str: The name of the fallback tokenizer. + """ return self._FALLBACK_TOKENIZER def __init__(self, **data): + """ + Initialize the LLMEndpointConfig. 
+ + This method sets up the initial configuration, including setting the LLM model + config and API key. + + Args: + **data: Keyword arguments for initializing the config. + """ super().__init__(**data) self.set_llm_model_config() self.set_api_key() def set_api_key(self, force_reset: bool = False): - # Check if the corresponding API key environment variable is set + """ + Set the API key for the LLM provider. + + This method attempts to set the API key from the environment variable. + If the key is not found, it raises a ValueError. + + Args: + force_reset (bool): If True, forces a reset of the API key even if already set. + + Raises: + ValueError: If the API key is not set in the environment. + """ if not self.llm_api_key or force_reset: self.llm_api_key = os.getenv(self.env_variable_name) @@ -195,7 +285,12 @@ class LLMEndpointConfig(QuivrBaseConfig): ) def set_llm_model_config(self): - # Automatically set context_length and tokenizer_hub based on the supplier and model + """ + Set the LLM model configuration. + + This method automatically sets the context_length and tokenizer_hub + based on the current supplier and model. + """ llm_model_config = LLMModelConfig.get_llm_model_config( self.supplier, self.model ) @@ -204,6 +299,18 @@ class LLMEndpointConfig(QuivrBaseConfig): self.tokenizer_hub = llm_model_config.tokenizer_hub def set_llm_model(self, model: str): + """ + Set the LLM model and update related configurations. + + This method updates the supplier and model based on the provided model name, + then updates the model config and API key accordingly. + + Args: + model (str): The name of the model to set. + + Raises: + ValueError: If no corresponding supplier is found for the given model. 
+ """ supplier = LLMModelConfig.get_supplier_by_model_name(model) if supplier is None: raise ValueError( @@ -217,11 +324,18 @@ class LLMEndpointConfig(QuivrBaseConfig): def set_from_sqlmodel(self, sqlmodel: SQLModel, mapping: Dict[str, str]): """ - Set attributes in LLMEndpointConfig from Model attributes using a field mapping. + Set attributes in LLMEndpointConfig from SQLModel attributes using a field mapping. - :param model_instance: An instance of the Model class. - :param mapping: A dictionary that maps Model fields to LLMEndpointConfig fields. - Example: {"max_input": "max_input_tokens", "env_variable_name": "env_variable_name"} + This method allows for dynamic setting of LLMEndpointConfig attributes based on + a provided SQLModel instance and a mapping dictionary. + + Args: + sqlmodel (SQLModel): An instance of the SQLModel class. + mapping (Dict[str, str]): A dictionary that maps SQLModel fields to LLMEndpointConfig fields. + Example: {"max_input": "max_input_tokens", "env_variable_name": "env_variable_name"} + + Raises: + AttributeError: If any field in the mapping doesn't exist in either the SQLModel or LLMEndpointConfig. """ for model_field, llm_field in mapping.items(): if hasattr(sqlmodel, model_field) and hasattr(self, llm_field): @@ -234,21 +348,47 @@ class LLMEndpointConfig(QuivrBaseConfig): # Cannot use Pydantic v2 field_validator because of conflicts with pydantic v1 still in use in LangChain class RerankerConfig(QuivrBaseConfig): + """ + Configuration class for reranker models. + + This class defines the settings for reranker models used in the application, + including the supplier, model, and API key information. + + Attributes: + supplier (DefaultRerankers | None): The reranker supplier (e.g., COHERE). + model (str | None): The specific reranker model to use. + top_n (int): The number of top chunks returned by the reranker (default: 5). + api_key (str | None): The API key for the reranker service. 
+ """ + supplier: DefaultRerankers | None = None model: str | None = None top_n: int = 5 api_key: str | None = None def __init__(self, **data): - super().__init__(**data) # Call Pydantic's BaseModel init - self.validate_model() # Automatically call external validation + """ + Initialize the RerankerConfig. + + Args: + **data: Keyword arguments for initializing the config. + """ + super().__init__(**data) + self.validate_model() def validate_model(self): - # If model is not provided, get default model based on supplier + """ + Validate and set up the reranker model configuration. + + This method ensures that a model is set (using the default if not provided) + and that the necessary API key is available in the environment. + + Raises: + ValueError: If the required API key is not set in the environment. + """ if self.model is None and self.supplier is not None: self.model = self.supplier.default_model - # Check if the corresponding API key environment variable is set if self.supplier: api_key_var = f"{self.supplier.upper()}_API_KEY" self.api_key = os.getenv(api_key_var) @@ -261,34 +401,102 @@ class RerankerConfig(QuivrBaseConfig): class NodeConfig(QuivrBaseConfig): + """ + Configuration class for a node in an AI assistant workflow. + + This class represents a single node in a workflow configuration, + defining its name and connections to other nodes. + + Attributes: + name (str): The name of the node. + edges (List[str]): List of names of other nodes this node links to. + """ + name: str - # config: QuivrBaseConfig # This can be any config like RerankerConfig or LLMEndpointConfig - edges: List[str] # List of names of other nodes this node links to + edges: List[str] class WorkflowConfig(QuivrBaseConfig): + """ + Configuration class for an AI assistant workflow. + + This class represents the entire workflow configuration, + consisting of multiple interconnected nodes. + + Attributes: + name (str): The name of the workflow. 
+ nodes (List[NodeConfig]): List of nodes in the workflow. + """ + name: str nodes: List[NodeConfig] class RetrievalConfig(QuivrBaseConfig): + """ + Configuration class for the retrieval phase of a RAG assistant. + + This class defines the settings for the retrieval process, + including reranker and LLM configurations, as well as various limits and prompts. + + Attributes: + workflow_config (WorkflowConfig | None): Configuration for the workflow. + reranker_config (RerankerConfig): Configuration for the reranker. + llm_config (LLMEndpointConfig): Configuration for the LLM endpoint. + max_history (int): Maximum number of past conversation turns to pass to the LLM as context (default: 10). + max_files (int): Maximum number of files to process (default: 20). + prompt (str | None): Custom prompt for the retrieval process. + """ + + workflow_config: WorkflowConfig | None = None reranker_config: RerankerConfig = RerankerConfig() llm_config: LLMEndpointConfig = LLMEndpointConfig() max_history: int = 10 max_files: int = 20 prompt: str | None = None - workflow_config: WorkflowConfig | None = None class ParserConfig(QuivrBaseConfig): + """ + Configuration class for the parser. + + This class defines the settings for the parsing process, + including configurations for the text splitter and Megaparse. + + Attributes: + splitter_config (SplitterConfig): Configuration for the text splitter. + megaparse_config (MegaparseConfig): Configuration for Megaparse. + """ + splitter_config: SplitterConfig = SplitterConfig() megaparse_config: MegaparseConfig = MegaparseConfig() class IngestionConfig(QuivrBaseConfig): + """ + Configuration class for the data ingestion process. + + This class defines the settings for the data ingestion process, + including the parser configuration. + + Attributes: + parser_config (ParserConfig): Configuration for the parser. 
+ """ + parser_config: ParserConfig = ParserConfig() class AssistantConfig(QuivrBaseConfig): + """ + Configuration class for an AI assistant. + + This class defines the overall configuration for an AI assistant, + including settings for retrieval and ingestion processes. + + Attributes: + retrieval_config (RetrievalConfig): Configuration for the retrieval process. + ingestion_config (IngestionConfig): Configuration for the ingestion process. + """ + retrieval_config: RetrievalConfig = RetrievalConfig() - ingestion_config: IngestionConfig = IngestionConfig() + ingestion_config: IngestionConfig = IngestionConfig() \ No newline at end of file diff --git a/backend/core/quivr_core/storage/__init__.py b/backend/core/quivr_core/storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/core/quivr_core/storage/local_storage.py b/backend/core/quivr_core/storage/local_storage.py index 146c0f725..7c5f44018 100644 --- a/backend/core/quivr_core/storage/local_storage.py +++ b/backend/core/quivr_core/storage/local_storage.py @@ -10,6 +10,27 @@ from quivr_core.storage.storage_base import StorageBase class LocalStorage(StorageBase): + """ + LocalStorage is a concrete implementation of the `StorageBase` class that + stores files locally on disk. This class manages file uploads, tracks file + hashes, and allows retrieval of stored files from a specified directory. + + Attributes: + name (str): The name of the storage type, set to "local_storage". + files (list[QuivrFile]): A list of files stored in this local storage. + hashes (Set[str]): A set of SHA-1 hashes of the uploaded files. + copy_flag (bool): If `True`, files are copied to the storage directory. + If `False`, symbolic links are used instead. + dir_path (Path): The directory path where files are stored. + + Args: + dir_path (Path | None): Optional directory path for storing files. + Defaults to the environment variable `QUIVR_LOCAL_STORAGE` + or `~/.cache/quivr/files`. 
+ copy_flag (bool): Whether to copy the file or create a symlink. + Defaults to `True`. + """ + name: str = "local_storage" def __init__(self, dir_path: Path | None = None, copy_flag: bool = True): @@ -36,6 +57,20 @@ class LocalStorage(StorageBase): return {"directory_path": self.dir_path, **super().info()} async def upload_file(self, file: QuivrFile, exists_ok: bool = False) -> None: + """ + Uploads a file to the local storage. Copies or creates a symlink based + on the `copy_flag` attribute. Checks for duplicate file uploads using + the file's SHA-1 hash. + + Args: + file (QuivrFile): The file object to upload. + exists_ok (bool): If `True`, allows overwriting an existing file. + Defaults to `False`. + + Raises: + FileExistsError: If a file with the same SHA-1 hash already exists + and `exists_ok` is set to `False`. + """ dst_path = os.path.join( self.dir_path, str(file.brain_id), f"{file.id}{file.file_extension}" ) @@ -53,13 +88,42 @@ class LocalStorage(StorageBase): self.hashes.add(file.file_sha1) async def get_files(self) -> list[QuivrFile]: + """ + Retrieves the list of files stored in the local storage. + + Returns: + list[QuivrFile]: A list of stored file objects. + """ return self.files async def remove_file(self, file_id: UUID) -> None: + """ + Removes a file from the local storage. This method is currently not + implemented. + + Args: + file_id (UUID): The unique identifier of the file to remove. + + Raises: + NotImplementedError: Always raises this error as the method is not yet implemented. + """ raise NotImplementedError @classmethod def load(cls, config: LocalStorageConfig) -> Self: + """ + Loads the local storage from a configuration object. This method + initializes the storage directory and populates it with deserialized + files from the configuration. + + Args: + config (LocalStorageConfig): Configuration object containing the + storage path and serialized file data. 
+ + Returns: + LocalStorage: An instance of `LocalStorage` with files loaded + from the configuration. + """ tstorage = cls(dir_path=config.storage_path) tstorage.files = [QuivrFile.deserialize(f) for f in config.files.values()] return tstorage diff --git a/backend/core/quivr_core/storage/storage_base.py b/backend/core/quivr_core/storage/storage_base.py index 9fa385355..57183df38 100644 --- a/backend/core/quivr_core/storage/storage_base.py +++ b/backend/core/quivr_core/storage/storage_base.py @@ -6,6 +6,13 @@ from quivr_core.storage.local_storage import QuivrFile class StorageBase(ABC): + """ + Abstract base class for storage systems. All subclasses are required to define certain attributes and implement specific methods for managing files + + Attributes: + name (str): Name of the storage type. + """ + name: str def __init_subclass__(cls, **kwargs): @@ -21,21 +28,64 @@ class StorageBase(ABC): @abstractmethod def nb_files(self) -> int: + """ + Abstract method to get the number of files in the storage. + + Returns: + int: The number of files in the storage. + + Raises: + Exception: If the method is not implemented. + """ raise Exception("Unimplemented nb_files method") @abstractmethod async def get_files(self) -> list[QuivrFile]: + """ + Abstract asynchronous method to get the files `QuivrFile` in the storage. + + Returns: + list[QuivrFile]: A list of QuivrFile objects representing the files in the storage. + + Raises: + Exception: If the method is not implemented. + """ raise Exception("Unimplemented get_files method") @abstractmethod async def upload_file(self, file: QuivrFile, exists_ok: bool = False) -> None: + """ + Abstract asynchronous method to upload a file to the storage. + + Args: + file (QuivrFile): The file to upload. + exists_ok (bool): If True, allows overwriting the file if it already exists. Default is False. + + Raises: + Exception: If the method is not implemented. 
+ """ raise Exception("Unimplemented upload_file method") @abstractmethod async def remove_file(self, file_id: UUID) -> None: + """ + Abstract asynchronous method to remove a file from the storage. + + Args: + file_id (UUID): The unique identifier of the file to be removed. + + Raises: + Exception: If the method is not implemented. + """ raise Exception("Unimplemented remove_file method") def info(self) -> StorageInfo: + """ + Returns information about the storage, including the storage type and the number of files. + + Returns: + StorageInfo: An object containing details about the storage. + """ return StorageInfo( storage_type=self.name, n_files=self.nb_files(), diff --git a/backend/docs/docs/brain/brain.md b/backend/docs/docs/brain/brain.md new file mode 100644 index 000000000..cf0e26cf1 --- /dev/null +++ b/backend/docs/docs/brain/brain.md @@ -0,0 +1,3 @@ +::: quivr_core.brain.brain + options: + heading_level: 2 diff --git a/backend/docs/docs/brain/chat.md b/backend/docs/docs/brain/chat.md new file mode 100644 index 000000000..b4c77370f --- /dev/null +++ b/backend/docs/docs/brain/chat.md @@ -0,0 +1,11 @@ +## ChatHistory + +The `ChatHistory` class is where all the conversation between the user and the LLM gets stored. A `ChatHistory` object will transparently be instantiated in the `Brain` every time you create one. + +At each interaction with `Brain.ask_streaming` both your message and the LLM's response are added to this chat history. It's super handy because this history is used in the Retrieval-Augmented Generation (RAG) process to give the LLM more context, working as a form of memory between the user and the system and helping it generate better responses by looking at what’s already been said. + +You can also get some cool info about the brain by printing its details with the `print_info()` method, which shows things like how many chats are stored, the current chat history, and more.
This makes it easy to keep track of what’s going on in your conversations and manage the context being sent to the LLM! + +::: quivr_core.chat +    options: +      heading_level: 2 diff --git a/backend/docs/docs/brain/index.md b/backend/docs/docs/brain/index.md new file mode 100644 index 000000000..35dafcd72 --- /dev/null +++ b/backend/docs/docs/brain/index.md @@ -0,0 +1,42 @@ +# Brain + +The brain is the essential component of Quivr that stores and processes the knowledge you want to retrieve information from. Simply create a brain with the files you want to process and use the latest Quivr RAG workflow to retrieve information from the knowledge. + +Quick Start 🪄: + +```python +from quivr_core import Brain +from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph + + +brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"]) +answer = brain.ask("What is Quivr ?") +print("Answer Quivr :", answer.answer) + +``` + +Pimp your Brain 🔨 : + +```python +from quivr_core import Brain +from quivr_core.llm.llm_endpoint import LLMEndpoint +from quivr_core.embedder.embedder import DeterministicFakeEmbedding +from quivr_core.llm.llm_endpoint import LLMEndpointConfig +from quivr_core.llm.llm_endpoint import FakeListChatModel + +brain = Brain.from_files( + name="test_brain", + file_paths=["my/information/source/file.pdf"], + llm=LLMEndpoint( + llm=FakeListChatModel(responses=["good"]), + llm_config=LLMEndpointConfig(model="fake_model", llm_base_url="local"), + ), + embedder=DeterministicFakeEmbedding(size=20), + ) + +answer = brain.ask( + "What is Quivr ?"
+ ) +print("Answer Quivr :", answer.answer) + +``` diff --git a/backend/docs/docs/config/base_config.md b/backend/docs/docs/config/base_config.md new file mode 100644 index 000000000..812aaa4b3 --- /dev/null +++ b/backend/docs/docs/config/base_config.md @@ -0,0 +1,5 @@ +# Configuration Base Class + +::: quivr_core.base_config + options: + heading_level: 2 \ No newline at end of file diff --git a/backend/docs/docs/config/config.md b/backend/docs/docs/config/config.md new file mode 100644 index 000000000..25f4460f0 --- /dev/null +++ b/backend/docs/docs/config/config.md @@ -0,0 +1,22 @@ +# Configuration + +## Retrieval Configuration +::: quivr_core.config.RetrievalConfig + +## Workflow Configuration +::: quivr_core.config.WorkflowConfig + +## LLM Configuration +::: quivr_core.config.LLMEndpointConfig + +## Reranker Configuration +::: quivr_core.config.RerankerConfig + +## Supported LLM Model Suppliers +::: quivr_core.config.DefaultModelSuppliers + +## Supported Rerankers +::: quivr_core.config.DefaultRerankers + + + diff --git a/backend/docs/docs/config/index.md b/backend/docs/docs/config/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/backend/docs/docs/examples/custom_storage.md b/backend/docs/docs/examples/custom_storage.md new file mode 100644 index 000000000..2c2f85450 --- /dev/null +++ b/backend/docs/docs/examples/custom_storage.md @@ -0,0 +1,3 @@ +# Transparent Storage + +**todo** diff --git a/backend/docs/docs/parsers/index.md b/backend/docs/docs/parsers/index.md index d878fa1fb..ced0885e4 100644 --- a/backend/docs/docs/parsers/index.md +++ b/backend/docs/docs/parsers/index.md @@ -1,3 +1,3 @@ +# Parsers Quivr provides a suite of parsers to extract structured data from various sources. 
- diff --git a/backend/docs/docs/storage/base.md b/backend/docs/docs/storage/base.md new file mode 100644 index 000000000..b0b2de8c5 --- /dev/null +++ b/backend/docs/docs/storage/base.md @@ -0,0 +1,5 @@ +# StorageBase + +::: quivr_core.storage.storage_base +    options: +      heading_level: 2 diff --git a/backend/docs/docs/storage/index.md b/backend/docs/docs/storage/index.md new file mode 100644 index 000000000..7e6fe01af --- /dev/null +++ b/backend/docs/docs/storage/index.md @@ -0,0 +1,28 @@ +# 🗄️ Storage + +## Your Brain’s File Management System + +The `Storage` class is the backbone of how a brain interacts with files in `quivr-core`. Every brain holds a reference to an underlying storage system to manage its files. All storages should implement the `StorageBase` base class that provides the structure and methods to make that happen seamlessly. Let's walk through how it works: + +- **Brain-Storage Connection:** Your brain holds a reference to a storage system. This class is the main way your brain can interact with and manage the files it uses. Adding files to a brain will upload them to the storage. This means that files in the storage are stored **before** processing! +- **File Management:** The storage holds a set of `QuivrFile` objects, which are the building blocks of your brain’s file system. The storage can store them remotely or locally or hold simple + +### What can you do with this storage system? + +1. Upload Files: You can add new files to your storage whenever you need. The system also lets you decide whether to overwrite existing files or not. +2. Get Files: Need to see what's in your storage? No problem. You can easily retrieve a list of all the files that are stored. +3. Delete Files: Clean-up is simple. You can remove any file from your storage by referencing its unique file ID (more on that in `QuivrFile`). + +StorageBase is the foundation of how your brain organizes, uploads, retrieves, and deletes its files.
It ensures that your brain can always stay up-to-date with the files it needs, making file management smooth and intuitive. You can build your own storage system by subclassing the `StorageBase` class and passing it to the brain. See [custom_storage](../examples/custom_storage.md) for more details. + +### Storage Implementations in `quivr_core` + +`quivr_core` currently offers two storage implementations: `LocalStorage` and `TransparentStorage`: + +- **LocalStorage**: + This storage type is perfect when you want to keep files on your local machine. `LocalStorage` saves your files to a specific directory, either a default path (`~/.cache/quivr/files`) or a user-defined location. It can store files by copying them or by creating symbolic links to the original files, based on your preference. This storage type also keeps track of file hashes to prevent accidental overwrites during uploads. + +- **TransparentStorage**: + The `TransparentStorage` implementation offers a lightweight and flexible approach, mainly managing files in memory without a need for local file paths. This storage system is useful when you don't need persistent storage but rather an easy way to store and retrieve files temporarily during the brain's operation. + +Each of these storage systems has its own strengths, catering to different use cases. As `quivr_core` evolves, we will implement more and more storage systems allowing for even more advanced and customized ways to manage your files like `S3Storage`, `NFSStorage` ...
diff --git a/backend/docs/docs/storage/local_storage.md b/backend/docs/docs/storage/local_storage.md new file mode 100644 index 000000000..62606b7c9 --- /dev/null +++ b/backend/docs/docs/storage/local_storage.md @@ -0,0 +1,5 @@ +# LocalStorage + +::: quivr_core.storage.local_storage +    options: +      heading_level: 2 diff --git a/backend/docs/mkdocs.yml b/backend/docs/mkdocs.yml index c80609003..3c25eaf6b 100644 --- a/backend/docs/mkdocs.yml +++ b/backend/docs/mkdocs.yml @@ -45,28 +45,45 @@ theme: name: Switch to system preference plugins: -- mkdocstrings: - default_handler: python - + - mkdocstrings: + default_handler: python + handlers: + python: + docstring_style: google + options: + show_source: false + heading_level: 2 + separate_signature: true nav: - - Home: - - index.md - - installation.md - - Parsers: - - parsers/index.md - - parsers/megaparse.md - - parsers/simple.md - - Vector Stores: - - vectorstores/index.md - - vectorstores/faiss.md - - vectorstores/pgvector.md - - Workflows: - - workflows/index.md + - Home: + - index.md + - installation.md + - Brain: + - brain/index.md + - brain/brain.md + - brain/chat.md + - Storage: + - storage/index.md + - storage/base.md + - Parsers: + - parsers/index.md + - parsers/megaparse.md + - parsers/simple.md + - Vector Stores: + - vectorstores/index.md + - vectorstores/faiss.md + - vectorstores/pgvector.md + - Workflows: + - workflows/index.md + - Examples: + - workflows/examples/chat.md + - workflows/examples/rag_with_internet.md + - Configuration: + - config/index.md + - config/base_config.md + - config/config.md - Examples: - - workflows/examples/chat.md - - workflows/examples/rag_with_internet.md - - Examples: - - examples/index.md + - examples/index.md + - examples/custom_storage.md - Enterprise: https://docs.quivr.app/ -