mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-15 17:43:03 +03:00
ef90e8e672
# Description
Major PR which, among other things, introduces the possibility of easily
customizing the retrieval workflows. Workflows are based on LangGraph
and can be customized by editing a [yaml configuration
file](core/tests/test_llm_endpoint.py) and adding the implementation of
the node logic to
[quivr_rag_langgraph.py](1a0c98437a/backend/core/quivr_core/quivr_rag_langgraph.py).
This is a first, simple implementation that will significantly evolve in
the coming weeks to enable more complex workflows (for instance, with
conditional nodes). We also plan to adopt a similar approach for the
ingestion part, i.e. to enable users to easily customize the ingestion
pipeline.
Closes CORE-195, CORE-203, CORE-204
## Checklist before requesting a review
Please delete options that are not relevant.
- [X] My code follows the style guidelines of this project
- [X] I have performed a self-review of my code
- [X] I have commented hard-to-understand areas
- [X] I have ideally added tests that prove my fix is effective or that
my feature works
- [X] New and existing unit tests pass locally with my changes
- [X] Any dependent changes have been merged
## Screenshots (if appropriate):
160 lines
4.0 KiB
Plaintext
Vendored
160 lines
4.0 KiB
Plaintext
Vendored
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from megaparse.Converter import MegaParse\n",
|
|
"from IPython.display import display_markdown\n",
|
|
"import pdfminer\n",
|
|
"from pdfminer.image import ImageWriter\n",
|
|
"from pdfminer.high_level import extract_pages\n",
|
|
"\n",
|
|
"import fitz\n",
|
|
"import io\n",
|
|
"from PIL import Image"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"file_path = \"megaparse/tests/input_tests/MegaFake_report.pdf\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"megaparse = MegaParse(file_path=file_path)\n",
|
|
"content = megaparse.convert()\n",
|
|
"megaparse.save_md(md_content=content, file_path=\"./content.md\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"display_markdown(content, raw=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# def extract_images_from_pdf(pdf_file_path, output_dir):\n",
|
|
"# iw = ImageWriter(output_dir)\n",
|
|
"# image_count = 0\n",
|
|
"\n",
|
|
"# for page_num, page_layout in enumerate(extract_pages(pdf_file_path)):\n",
|
|
"# for image in get_images_from_page(page_layout):\n",
|
|
"# image_name = f\"image_{image_count}_page_{page_num}.png\"\n",
|
|
"# iw.export_image(image)\n",
|
|
"# image_count += 1\n",
|
|
"\n",
|
|
"\n",
|
|
"# def get_images_from_page(page_layout):\n",
|
|
"# if isinstance(page_layout, pdfminer.layout.LTImage):\n",
|
|
"# return [page_layout]\n",
|
|
"# if isinstance(page_layout, pdfminer.layout.LTContainer):\n",
|
|
"# img_list = []\n",
|
|
"# for child in page_layout:\n",
|
|
"# img_list += get_images_from_page(child)\n",
|
|
"# return img_list\n",
|
|
"# else:\n",
|
|
"# return []"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def extract_images_from_pdf(pdf_file_path: str, output_dir: str):\n",
|
|
"    \"\"\"Save every image embedded in a PDF as a separate file.\n",
|
|
"\n",
|
|
"    Files are named ``image_<n>_page_<p>.<ext>``, where ``<n>`` counts the\n",
|
|
"    images of a page starting at 1 and ``<p>`` is the 0-based page index.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        pdf_file_path: Path of the PDF document to open with ``fitz``.\n",
|
|
"        output_dir: Prefix prepended verbatim to every file name, so it\n",
|
|
"            must end with a path separator (e.g. ``\"output/\"``).\n",
|
|
"    \"\"\"\n",
|
|
"    # The context manager closes the document even if saving an image fails.\n",
|
|
"    with fitz.open(pdf_file_path) as pdf_file:\n",
|
|
"        # Pages are indexed from 0: start at 0 so the first page is not skipped.\n",
|
|
"        for page_number in range(len(pdf_file)):\n",
|
|
"            page = pdf_file[page_number]\n",
|
|
"            for image_index, img in enumerate(page.get_images(), start=1):\n",
|
|
"                xref = img[0]  # first item of each entry is the image xref\n",
|
|
"                base_image = pdf_file.extract_image(xref)\n",
|
|
"                image_bytes = base_image[\"image\"]\n",
|
|
"                image_ext = base_image[\"ext\"]\n",
|
|
"                pil_image = Image.open(io.BytesIO(image_bytes))\n",
|
|
"                image_path = (\n",
|
|
"                    f\"{output_dir}image_{image_index}_page_{page_number}.{image_ext}\"\n",
|
|
"                )\n",
|
|
"                pil_image.save(image_path)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1\n",
|
|
"1\n",
|
|
"1\n",
|
|
"1\n",
|
|
"1\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "ENV",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|