2024-07-22 11:09:02 +03:00
|
|
|
import logging
|
|
|
|
from heapq import heappop
|
|
|
|
|
2024-07-19 10:47:39 +03:00
|
|
|
import pytest
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from quivr_core import registry
|
2024-07-22 11:09:02 +03:00
|
|
|
from quivr_core.files.file import FileExtension, QuivrFile
|
2024-07-30 19:49:12 +03:00
|
|
|
from quivr_core.processor.implementations.simple_txt_processor import SimpleTxtProcessor
|
2024-07-22 11:09:02 +03:00
|
|
|
from quivr_core.processor.implementations.tika_processor import TikaProcessor
|
2024-07-19 10:47:39 +03:00
|
|
|
from quivr_core.processor.processor_base import ProcessorBase
|
|
|
|
from quivr_core.processor.registry import (
|
2024-07-22 11:09:02 +03:00
|
|
|
_LOWEST_PRIORITY,
|
|
|
|
ProcEntry,
|
|
|
|
ProcMapping,
|
|
|
|
_append_proc_mapping,
|
2024-07-19 10:47:39 +03:00
|
|
|
_import_class,
|
2024-07-22 11:09:02 +03:00
|
|
|
available_processors,
|
2024-07-19 10:47:39 +03:00
|
|
|
get_processor_class,
|
2024-07-22 11:09:02 +03:00
|
|
|
known_processors,
|
2024-07-19 10:47:39 +03:00
|
|
|
register_processor,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
# TODO : reimplement when quivr-core will be its own package
|
|
|
|
@pytest.mark.skip(reason="TODO: reimplement when quivr-core will be its own package")
def test_get_default_processors_cls():
    """Check the classes resolved for FileExtension.txt and FileExtension.pdf.

    The .txt extension is expected to resolve to TikTokenTxtProcessor and the
    .pdf extension to TikaProcessor.
    """
    # Imported locally, exactly as the test did originally.
    from quivr_core.processor.implementations.default import TikTokenTxtProcessor

    txt_cls = get_processor_class(FileExtension.txt)
    assert txt_cls == TikTokenTxtProcessor

    pdf_cls = get_processor_class(FileExtension.pdf)
    # FIXME: using this class will actually fail if you don't have the
    # NOTE(review): the FIXME above is cut off in the source; the missing
    # requirement is unknown — TODO confirm and complete it.
    assert pdf_cls == TikaProcessor
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip(reason="TODO: reimplement when quivr-core will be its own package")
def test_get_default_processors_cls_core():
    """Check the processor classes resolved for .txt and .pdf extensions.

    FileExtension.txt is expected to map to SimpleTxtProcessor and
    FileExtension.pdf to TikaProcessor.
    """
    # Checked in the same order as before: txt first, then pdf.
    expectations = (
        (FileExtension.txt, SimpleTxtProcessor),
        (FileExtension.pdf, TikaProcessor),
    )
    for extension, expected_cls in expectations:
        resolved = get_processor_class(extension)
        assert resolved == expected_cls
|
|
|
|
|
|
|
|
|
2024-07-22 11:09:02 +03:00
|
|
|
def test_append_proc_mapping_empty():
    """Appending to an empty mapping creates a single entry for the extension.

    With ``priority=None`` the stored entry is expected to carry
    ``_LOWEST_PRIORITY``.
    """
    mapping: ProcMapping = {}
    expected_entry = ProcEntry(
        priority=_LOWEST_PRIORITY, cls_mod="test.test", err="error"
    )

    _append_proc_mapping(
        mapping,
        file_ext=FileExtension.txt,
        cls_mod="test.test",
        errtxt="error",
        priority=None,
    )

    assert len(mapping) == 1
    txt_entries = mapping[FileExtension.txt]
    assert len(txt_entries) == 1
    assert txt_entries[0] == expected_entry
|
|
|
|
|
|
|
|
|
|
|
|
def test_append_proc_mapping_priority():
    """An entry appended with ``priority=0`` is the first one popped.

    The mapping starts with one ``_LOWEST_PRIORITY`` entry for
    FileExtension.txt; a second entry is then appended with priority 0.
    """
    default_entry = ProcEntry(
        cls_mod="quivr_core.processor.implementations.simple_txt_processor.SimpleTxtProcessor",
        err=None,
        priority=_LOWEST_PRIORITY,
    )
    proc_mapping: ProcMapping = {FileExtension.txt: [default_entry]}

    _append_proc_mapping(
        proc_mapping,
        file_ext=FileExtension.txt,
        cls_mod="test.test",
        errtxt="error",
        priority=0,
    )

    txt_entries = proc_mapping[FileExtension.txt]
    assert len(txt_entries) == 2

    # heappop removes the smallest item of the heap; the test expects that to
    # be the entry registered with priority 0.
    first_popped = heappop(txt_entries)
    assert first_popped == ProcEntry(priority=0, cls_mod="test.test", err="error")
|
|
|
|
|
|
|
|
|
|
|
|
def test_append_proc_mapping():
    """Appending with ``priority=None`` to a non-empty mapping.

    The new entry is expected to get ``_LOWEST_PRIORITY - 1`` and to be
    popped before the pre-existing ``_LOWEST_PRIORITY`` entry.
    """
    simple_txt_path = (
        "quivr_core.processor.implementations.simple_txt_processor.SimpleTxtProcessor"
    )
    proc_mapping: ProcMapping = {
        FileExtension.txt: [
            ProcEntry(cls_mod=simple_txt_path, err=None, priority=_LOWEST_PRIORITY)
        ],
    }

    _append_proc_mapping(
        proc_mapping,
        file_ext=FileExtension.txt,
        cls_mod="test.test",
        errtxt="error",
        priority=None,
    )

    txt_entries = proc_mapping[FileExtension.txt]
    assert len(txt_entries) == 2

    # Pop both entries from the heap and check the order they come out in.
    first_popped = heappop(txt_entries)
    assert first_popped == ProcEntry(
        priority=_LOWEST_PRIORITY - 1, cls_mod="test.test", err="error"
    )

    second_popped = heappop(txt_entries)
    assert second_popped == ProcEntry(
        cls_mod=simple_txt_path,
        err=None,
        priority=_LOWEST_PRIORITY,
    )
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip(
    reason="TODO: audio processors will be added to quivr-core very soon!"
)
def test_known_processors():
    """Every FileExtension member must have an entry in known_processors."""
    every_ext_known = all(ext in known_processors for ext in FileExtension)
    assert (
        every_ext_known
    ), "base-env : Some file extensions don't have a default processor"
|
2024-07-22 11:09:02 +03:00
|
|
|
|
|
|
|
|
2024-07-19 10:47:39 +03:00
|
|
|
def test__import_class():
    """Exercise _import_class on a valid class path and two invalid ones.

    - A dotted path to TikaProcessor returns the TikaProcessor class.
    - A path to a module ("quivr_core.processor") raises TypeError with a
      message matching "is not a class".
    - "quivr_core.Brain" raises TypeError with a message matching
      "ProcessorBase".
    """
    tika_path = "quivr_core.processor.implementations.tika_processor.TikaProcessor"
    assert _import_class(tika_path) == TikaProcessor

    module_path = "quivr_core.processor"
    with pytest.raises(TypeError, match=r".* is not a class"):
        _import_class(module_path)

    non_processor_path = "quivr_core.Brain"
    with pytest.raises(TypeError, match=r".* ProcessorBase"):
        _import_class(non_processor_path)
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip(reason="TODO: reimplement when quivr-core will be its own package")
def test_get_processor_cls_import_error(caplog):
    """
    Test in an environment where only the bare minimum parsers are installed.

    A ".html" file can't be parsed there, so resolving its processor class
    should raise an ImportError.
    """
    html_ext = ".html"
    with pytest.raises(ImportError):
        get_processor_class(html_ext)
|
2024-07-22 11:09:02 +03:00
|
|
|
|
|
|
|
|
2024-07-19 10:47:39 +03:00
|
|
|
def test_get_processor_cls_error():
    """Resolving a processor for the extension ".sdfkj" raises ValueError."""
    bogus_ext = ".sdfkj"
    with pytest.raises(ValueError):
        get_processor_class(bogus_ext)
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip("needs tox for separating side effects on other tests")
def test_register_new_proc_noappend():
    """Registering "test." for .txt with ``append=False`` raises ValueError."""
    target_ext = FileExtension.txt
    with pytest.raises(ValueError):
        register_processor(target_ext, "test.", append=False)
|
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip("needs tox for separating side effects on other tests")
def test_register_new_proc_append(caplog):
    """Appending a processor grows the .txt list; re-registering only logs.

    The first registration of "test." adds one entry to
    ``known_processors[FileExtension.txt]``. The second registration is
    expected to emit a single INFO record on the "quivr_core" logger.
    """
    count_before = len(known_processors[FileExtension.txt])
    register_processor(FileExtension.txt, "test.", append=True)
    count_after = len(known_processors[FileExtension.txt])
    assert count_after == count_before + 1

    expected_records = [
        ("quivr_core", logging.INFO, "test. already in registry...")
    ]
    with caplog.at_level(logging.INFO, logger="quivr_core"):
        register_processor(FileExtension.txt, "test.", append=True)
        assert caplog.record_tuples == expected_records
|
2024-07-19 10:47:39 +03:00
|
|
|
|
|
|
|
|
2024-09-02 11:20:53 +03:00
|
|
|
@pytest.mark.skip("needs tox for separating side effects on other tests")
def test_register_new_proc():
    """Register a processor for a new ".test" extension and resolve it back.

    After registration the registry is expected to hold one more entry, and
    ``get_processor_class(".test")`` is expected to return the registered
    class.
    """
    nprocs = len(registry)

    # Stub implementing the same hooks (processor_metadata and
    # process_file_inner) as the other ProcessorBase stubs in this file,
    # rather than overriding process_file directly.
    class TestProcessor(ProcessorBase):
        supported_extensions = [".test"]

        @property
        def processor_metadata(self):
            return {}

        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
            return []

    register_processor(".test", TestProcessor)
    assert len(registry) == nprocs + 1

    cls = get_processor_class(".test")
    assert cls == TestProcessor
|
|
|
|
|
|
|
|
|
2024-07-22 11:09:02 +03:00
|
|
|
def test_register_non_processor():
    """Registering a class that does not derive from ProcessorBase fails.

    The test expects ``register_processor`` to raise AssertionError.
    """

    # Plain class: has supported_extensions but no ProcessorBase parent.
    class NOTPROC:
        supported_extensions = [".pdf"]

    pdf_ext = ".pdf"
    with pytest.raises(AssertionError):
        register_processor(pdf_ext, NOTPROC)  # type: ignore
|
|
|
|
|
|
|
|
|
2024-07-19 10:47:39 +03:00
|
|
|
def test_register_override_proc():
    """Registering with ``override=True`` replaces the .pdf processor.

    After the call, ``get_processor_class(FileExtension.pdf)`` is expected to
    return the newly registered stub class.
    """

    # Minimal ProcessorBase stub declared for the ".pdf" extension.
    class TestProcessor(ProcessorBase):
        supported_extensions = [".pdf"]

        @property
        def processor_metadata(self):
            return {}

        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
            return []

    register_processor(".pdf", TestProcessor, override=True)

    resolved = get_processor_class(FileExtension.pdf)
    assert resolved == TestProcessor
|
2024-07-22 11:09:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_register_override_error():
    """Registering over an existing .pdf processor without override fails.

    The test expects ``register_processor(..., override=False)`` to raise
    ValueError.
    """
    # Register class to pdf
    _ = get_processor_class(FileExtension.pdf)

    # Minimal ProcessorBase stub declared for FileExtension.pdf.
    class TestProcessor(ProcessorBase):
        supported_extensions = [FileExtension.pdf]

        @property
        def processor_metadata(self):
            return {}

        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
            return []

    pdf_ext = ".pdf"
    with pytest.raises(ValueError):
        register_processor(pdf_ext, TestProcessor, override=False)
|
|
|
|
|
|
|
|
|
|
|
|
def test_available_processors():
    """available_processors() is expected to report exactly 17 entries."""
    processor_count = len(available_processors())
    assert processor_count == 17
|