diff --git a/CHANGELOG.md b/CHANGELOG.md index c45dc657..c3ecad46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [v1.20.0](https://github.com/DS4SD/docling/releases/tag/v1.20.0) - 2024-10-11 + +### Feature + +* New experimental docling-parse v2 backend ([#131](https://github.com/DS4SD/docling/issues/131)) ([`5e4944f`](https://github.com/DS4SD/docling/commit/5e4944f15f0ac1faf3e6a532c3e3ab4da56517a3)) + ## [v1.19.1](https://github.com/DS4SD/docling/releases/tag/v1.19.1) - 2024-10-11 ### Fix diff --git a/README.md b/README.md index 49221b52..9e20e86e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
@@ -201,8 +201,8 @@ To see all available options (export formats etc.) run `docling --help`. ### RAG Check out the following examples showcasing RAG using Docling with standard LLM application frameworks: -- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb) -- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb) +- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb) +- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb) ## Advanced features diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py new file mode 100644 index 00000000..239ea9af --- /dev/null +++ b/docling/backend/docling_parse_v2_backend.py @@ -0,0 +1,237 @@ +import logging +import random +from io import BytesIO +from pathlib import Path +from typing import Iterable, List, Optional, Union + +import pypdfium2 as pdfium +from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_parse.docling_parse import pdf_parser_v2 +from PIL import Image, ImageDraw +from pypdfium2 import PdfPage + +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.datamodel.base_models import Cell, Size + +_log = logging.getLogger(__name__) + + +class DoclingParseV2PageBackend(PdfPageBackend): + def __init__( + self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage + ): + self._ppage = page_obj + parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) + + self.valid = "pages" in parsed_page + if self.valid: + self._dpage = parsed_page["pages"][page_no] + else: + _log.info( + f"An error occured when loading page {page_no} of document {document_hash}." 
+ ) + + def is_valid(self) -> bool: + return self.valid + + def get_text_in_rect(self, bbox: BoundingBox) -> str: + if not self.valid: + return "" + # Find intersecting cells on the page + text_piece = "" + page_size = self.get_size() + + parser_width = self._dpage["sanitized"]["dimension"]["width"] + parser_height = self._dpage["sanitized"]["dimension"]["height"] + + scale = ( + 1 # FIX - Replace with param in get_text_in_rect across backends (optional) + ) + + cells_data = self._dpage["sanitized"]["cells"]["data"] + cells_header = self._dpage["sanitized"]["cells"]["header"] + + for i, cell_data in enumerate(cells_data): + x0 = cell_data[cells_header.index("x0")] + y0 = cell_data[cells_header.index("y0")] + x1 = cell_data[cells_header.index("x1")] + y1 = cell_data[cells_header.index("y1")] + + cell_bbox = BoundingBox( + l=x0 * scale * page_size.width / parser_width, + b=y0 * scale * page_size.height / parser_height, + r=x1 * scale * page_size.width / parser_width, + t=y1 * scale * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_height=page_size.height * scale) + + overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area() + + if overlap_frac > 0.5: + if len(text_piece) > 0: + text_piece += " " + text_piece += cell_data[cells_header.index("text")] + + return text_piece + + def get_text_cells(self) -> Iterable[Cell]: + cells: List[Cell] = [] + cell_counter = 0 + + if not self.valid: + return cells + + page_size = self.get_size() + + parser_width = self._dpage["sanitized"]["dimension"]["width"] + parser_height = self._dpage["sanitized"]["dimension"]["height"] + + cells_data = self._dpage["sanitized"]["cells"]["data"] + cells_header = self._dpage["sanitized"]["cells"]["header"] + + for i, cell_data in enumerate(cells_data): + x0 = cell_data[cells_header.index("x0")] + y0 = cell_data[cells_header.index("y0")] + x1 = cell_data[cells_header.index("x1")] + y1 = cell_data[cells_header.index("y1")] + + if x1 
< x0: + x0, x1 = x1, x0 + if y1 < y0: + y0, y1 = y1, y0 + + text_piece = cell_data[cells_header.index("text")] + cells.append( + Cell( + id=cell_counter, + text=text_piece, + bbox=BoundingBox( + # l=x0, b=y0, r=x1, t=y1, + l=x0 * page_size.width / parser_width, + b=y0 * page_size.height / parser_height, + r=x1 * page_size.width / parser_width, + t=y1 * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_size.height), + ) + ) + cell_counter += 1 + + def draw_clusters_and_cells(): + image = ( + self.get_page_image() + ) # make new image to avoid drawing on the saved ones + draw = ImageDraw.Draw(image) + for c in cells: + x0, y0, x1, y1 = c.bbox.as_tuple() + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) + image.show() + + # draw_clusters_and_cells() + + return cells + + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: + AREA_THRESHOLD = 32 * 32 + + images = self._dpage["sanitized"]["images"]["data"] + images_header = self._dpage["sanitized"]["images"]["header"] + + for row in images: + x0 = row[images_header.index("x0")] + y0 = row[images_header.index("y0")] + x1 = row[images_header.index("x1")] + y1 = row[images_header.index("y1")] + + cropbox = BoundingBox.from_tuple( + (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT + ).to_top_left_origin(self.get_size().height) + + if cropbox.area() > AREA_THRESHOLD: + cropbox = cropbox.scaled(scale=scale) + + yield cropbox + + def get_page_image( + self, scale: float = 1, cropbox: Optional[BoundingBox] = None + ) -> Image.Image: + + page_size = self.get_size() + + if not cropbox: + cropbox = BoundingBox( + l=0, + r=page_size.width, + t=0, + b=page_size.height, + coord_origin=CoordOrigin.TOPLEFT, + ) + padbox = BoundingBox( + l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT + ) + else: + padbox = cropbox.to_bottom_left_origin(page_size.height) 
+ padbox.r = page_size.width - padbox.r + padbox.t = page_size.height - padbox.t + + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) + ) # We resize the image from 1.5x the given scale to make it sharper. + + return image + + def get_size(self) -> Size: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) + + def unload(self): + self._ppage = None + self._dpage = None + + +class DoclingParseV2DocumentBackend(PdfDocumentBackend): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + self._pdoc = pdfium.PdfDocument(self.path_or_stream) + self.parser = pdf_parser_v2("fatal") + + success = False + if isinstance(path_or_stream, BytesIO): + success = self.parser.load_document_from_bytesio( + self.document_hash, path_or_stream + ) + elif isinstance(path_or_stream, Path): + success = self.parser.load_document(self.document_hash, str(path_or_stream)) + + if not success: + raise RuntimeError( + f"docling-parse v2 could not load document {self.document_hash}." 
+ ) + + def page_count(self) -> int: + return len(self._pdoc) # To be replaced with docling-parse API + + def load_page(self, page_no: int) -> DoclingParseV2PageBackend: + return DoclingParseV2PageBackend( + self.parser, self.document_hash, page_no, self._pdoc[page_no] + ) + + def is_valid(self) -> bool: + return self.page_count() > 0 + + def unload(self): + super().unload() + self.parser.unload_document(self.document_hash) + self._pdoc.close() + self._pdoc = None diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 6c5e5f04..c31981be 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,19 +1,20 @@ import logging from typing import Iterable +import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page -from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) class TesseractOcrModel(BaseOcrModel): - def __init__(self, enabled: bool, options: TesseractCliOcrOptions): + def __init__(self, enabled: bool, options: TesseractOcrOptions): super().__init__(enabled=enabled, options=options) - self.options: TesseractCliOcrOptions + self.options: TesseractOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
self.reader = None diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 00000000..8763d718 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg new file mode 100644 index 00000000..035671c0 --- /dev/null +++ b/docs/assets/logo.svg @@ -0,0 +1,116 @@ + + + diff --git a/examples/batch_convert.py b/docs/examples/batch_convert.py similarity index 98% rename from examples/batch_convert.py rename to docs/examples/batch_convert.py index 02a6fc5e..73498915 100644 --- a/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -122,7 +122,7 @@ def main(): raises_on_error=False, # to let conversion run through all and examine results at the end ) success_count, partial_success_count, failure_count = export_documents( - conv_results, output_dir=Path("./scratch") + conv_results, output_dir=Path("../../examples/scratch") ) end_time = time.time() - start_time diff --git a/examples/custom_convert.py b/docs/examples/custom_convert.py similarity index 98% rename from examples/custom_convert.py rename to docs/examples/custom_convert.py index 7c28a23b..67d58a6b 100644 --- a/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -113,7 +113,7 @@ def main(): _log.info(f"Document converted in {end_time:.2f} seconds.") ## Export results - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") output_dir.mkdir(parents=True, exist_ok=True) doc_filename = conv_result.input.file.stem diff --git a/examples/export_figures.py b/docs/examples/export_figures.py similarity index 97% rename from examples/export_figures.py rename to docs/examples/export_figures.py index 4fa4dc58..d6fed16d 100644 --- a/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -15,7 +15,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") # 
Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. diff --git a/examples/export_multimodal.py b/docs/examples/export_multimodal.py similarity index 98% rename from examples/export_multimodal.py rename to docs/examples/export_multimodal.py index 01477f00..3646aea3 100644 --- a/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -20,7 +20,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. diff --git a/examples/export_tables.py b/docs/examples/export_tables.py similarity index 96% rename from examples/export_tables.py rename to docs/examples/export_tables.py index c7be89bc..ff962e80 100644 --- a/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -13,7 +13,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") doc_converter = DocumentConverter() diff --git a/examples/minimal.py b/docs/examples/minimal.py similarity index 100% rename from examples/minimal.py rename to docs/examples/minimal.py diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb new file mode 100644 index 00000000..f2464f29 --- /dev/null +++ b/docs/examples/rag_langchain.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG with LangChain 🦜🔗" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ 
+ "# requirements for this example:\n", + "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", + "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loader and splitter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we set up:\n", + "- a `Loader` which will be used to create LangChain documents, and\n", + "- a splitter, which will be used to split these documents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from enum import Enum\n", + "from typing import Iterator\n", + "\n", + "from langchain_core.document_loaders import BaseLoader\n", + "from langchain_core.documents import Document as LCDocument\n", + "from pydantic import BaseModel\n", + "\n", + "from docling.document_converter import DocumentConverter\n", + "\n", + "\n", + "class DocumentMetadata(BaseModel):\n", + " dl_doc_hash: str\n", + " # source: str\n", + "\n", + "\n", + "class DoclingPDFLoader(BaseLoader):\n", + " class ParseType(str, Enum):\n", + " MARKDOWN = \"markdown\"\n", + " # JSON = \"json\"\n", + "\n", + " def __init__(self, file_path: str | list[str], parse_type: ParseType) 
-> None:\n", + " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", + " self._parse_type = parse_type\n", + " self._converter = DocumentConverter()\n", + "\n", + " def lazy_load(self) -> Iterator[LCDocument]:\n", + " for source in self._file_paths:\n", + " dl_doc = self._converter.convert_single(source).output\n", + " match self._parse_type:\n", + " case self.ParseType.MARKDOWN:\n", + " text = dl_doc.export_to_markdown()\n", + " # case self.ParseType.JSON:\n", + " # text = dl_doc.model_dump_json()\n", + " case _:\n", + " raise RuntimeError(\n", + " f\"Unexpected parse type encountered: {self._parse_type}\"\n", + " )\n", + " lc_doc = LCDocument(\n", + " page_content=text,\n", + " metadata=DocumentMetadata(\n", + " dl_doc_hash=dl_doc.file_info.document_hash,\n", + " ).model_dump(),\n", + " )\n", + " yield lc_doc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1b38d07d5fed4618a44ecf261e1e5c44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 7 files: 0%| | 0/7 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "loader = DoclingPDFLoader(\n", + " file_path=FILE_PATH,\n", + " parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n", + ")\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=1000,\n", + " chunk_overlap=200,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now use the above-defined objects to get the document splits:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + 
"source": [ + "docs = loader.load()\n", + "splits = text_splitter.split_documents(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", + "\n", + "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vector store" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from tempfile import TemporaryDirectory\n", + "\n", + "from langchain_milvus import Milvus\n", + "\n", + "MILVUS_URI = os.environ.get(\n", + " \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", + ")\n", + "\n", + "vectorstore = Milvus.from_documents(\n", + " splits,\n", + " embeddings,\n", + " connection_args={\"uri\": MILVUS_URI},\n", + " drop_old=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The token has not been saved to the git credentials helper. 
Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", + "Token is valid (permission: write).\n", + "Your token has been saved to /Users/pva/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "from langchain_huggingface import HuggingFaceEndpoint\n", + "\n", + "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n", + "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n", + "\n", + "llm = HuggingFaceEndpoint(\n", + " repo_id=HF_LLM_MODEL_ID,\n", + " huggingfacehub_api_token=HF_API_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Iterable\n", + "\n", + "from langchain_core.documents import Document as LCDocument\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "\n", + "def format_docs(docs: Iterable[LCDocument]):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "prompt = PromptTemplate.from_template(\n", + " \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n", + ")\n", + "\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The human annotation of DocLayNet was performed on 80863 
pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb new file mode 100644 index 00000000..48ade368 --- /dev/null +++ b/docs/examples/rag_llamaindex.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
+
+
+
+
🎉 Docling is now officially supported in LlamaIndex! Check it out!
+{% endblock %} +#} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..5beec977 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,3 @@ +[data-md-color-scheme="default"] .md-banner a { + color: #5e8bde; +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..2deb6463 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,97 @@ +site_name: Docling +site_url: https://ds4sd.github.io/docling/ +repo_name: DS4SD/docling +repo_url: https://github.com/DS4SD/docling + +theme: + name: material + custom_dir: docs/overrides + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + scheme: default + primary: black + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: black + toggle: + icon: material/brightness-7 + name: Switch to dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/brightness-4 + name: Switch to system preference + + logo: assets/logo.png + favicon: assets/logo.png + features: + - content.tabs.link + - content.code.annotate + - content.code.copy + - announce.dismiss + - navigation.tabs + # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used + - navigation.instant + - navigation.instant.prefetch + # - navigation.instant.preview + - navigation.instant.progress + - navigation.path + - navigation.sections # <= + - navigation.top + - navigation.tracking + - search.suggest + - toc.follow +nav: + - Get started: + - Home: index.md + - Installation: installation.md + # - Docling v2: v2.md + # - Concepts: + # - Docling Document: concepts/document.md + # - Chunking: concepts/chunking.md + - Examples: + - Conversion: + - "Simple conversion": examples/minimal.py + - "Custom conversion": examples/custom_convert.py + - 
"Batch conversion": examples/batch_convert.py + - "Figure export": examples/export_figures.py + - "Table export": examples/export_tables.py + - "Multimodal export": examples/export_multimodal.py + - RAG / QA: + - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb + - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb + # - Chunking: + # - Chunking: examples/chunking.md + # - CLI: + # - CLI: examples/cli.md + - Integrations: + - "LlamaIndex 🦙 extension": integrations/llamaindex.md + # - "LangChain 🦜🔗 extension": integrations/langchain.md + # - API reference: + # - API reference: api_reference/index.md + +markdown_extensions: + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + - admonition + - pymdownx.details + - attr_list +plugins: + - search + - mkdocs-jupyter + +extra_css: + - stylesheets/extra.css diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py new file mode 100644 index 00000000..8c4252cb --- /dev/null +++ b/tests/test_backend_docling_parse_v2.py @@ -0,0 +1,77 @@ +from pathlib import Path + +import pytest + +from docling.backend.docling_parse_v2_backend import ( + DoclingParseV2DocumentBackend, + DoclingParseV2PageBackend, +) +from docling.datamodel.base_models import BoundingBox, InputFormat +from docling.datamodel.document import InputDocument + + +@pytest.fixture +def test_doc_path(): + return Path("./tests/data/2206.01062.pdf") + + +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=DoclingParseV2DocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + +@pytest.mark.skip +def test_text_cell_counts(): + pdf_doc = Path("./tests/data/redp5695.pdf") + + doc_backend = _get_backend(pdf_doc) + + for page_index in range(0, doc_backend.page_count()): + last_cell_count = None + for i in range(10): + page_backend: 
DoclingParseV2PageBackend = doc_backend.load_page(0) + cells = list(page_backend.get_text_cells()) + + if last_cell_count is None: + last_cell_count = len(cells) + + if len(cells) != last_cell_count: + assert ( + False + ), "Loading page multiple times yielded non-identical text cell counts" + last_cell_count = len(cells) + + +def test_get_text_from_rect(test_doc_path): + doc_backend = _get_backend(test_doc_path) + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) + + # Get the title text of the DocLayNet paper + textpiece = page_backend.get_text_in_rect( + bbox=BoundingBox(l=102, t=77, r=511, b=124) + ) + ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" + + assert textpiece.strip() == ref + + +def test_crop_page_image(test_doc_path): + doc_backend = _get_backend(test_doc_path) + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) + + # Crop out "Figure 1" from the DocLayNet paper + im = page_backend.get_page_image( + scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) + ) + # im.show() + + +def test_num_pages(test_doc_path): + doc_backend = _get_backend(test_doc_path) + doc_backend.page_count() == 9