diff --git a/CHANGELOG.md b/CHANGELOG.md index c45dc657..c3ecad46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [v1.20.0](https://github.com/DS4SD/docling/releases/tag/v1.20.0) - 2024-10-11 + +### Feature + +* New experimental docling-parse v2 backend ([#131](https://github.com/DS4SD/docling/issues/131)) ([`5e4944f`](https://github.com/DS4SD/docling/commit/5e4944f15f0ac1faf3e6a532c3e3ab4da56517a3)) + ## [v1.19.1](https://github.com/DS4SD/docling/releases/tag/v1.19.1) - 2024-10-11 ### Fix diff --git a/README.md b/README.md index 49221b52..9e20e86e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- Docling + Docling

@@ -201,8 +201,8 @@ To see all available options (export formats etc.) run `docling --help`. ### RAG Check out the following examples showcasing RAG using Docling with standard LLM application frameworks: -- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb) -- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb) +- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb) +- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb) ## Advanced features diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py new file mode 100644 index 00000000..239ea9af --- /dev/null +++ b/docling/backend/docling_parse_v2_backend.py @@ -0,0 +1,237 @@ +import logging +import random +from io import BytesIO +from pathlib import Path +from typing import Iterable, List, Optional, Union + +import pypdfium2 as pdfium +from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_parse.docling_parse import pdf_parser_v2 +from PIL import Image, ImageDraw +from pypdfium2 import PdfPage + +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.datamodel.base_models import Cell, Size + +_log = logging.getLogger(__name__) + + +class DoclingParseV2PageBackend(PdfPageBackend): + def __init__( + self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage + ): + self._ppage = page_obj + parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) + + self.valid = "pages" in parsed_page + if self.valid: + self._dpage = parsed_page["pages"][page_no] + else: + _log.info( + f"An error occured when loading page {page_no} of document {document_hash}." 
+ ) + + def is_valid(self) -> bool: + return self.valid + + def get_text_in_rect(self, bbox: BoundingBox) -> str: + if not self.valid: + return "" + # Find intersecting cells on the page + text_piece = "" + page_size = self.get_size() + + parser_width = self._dpage["sanitized"]["dimension"]["width"] + parser_height = self._dpage["sanitized"]["dimension"]["height"] + + scale = ( + 1 # FIX - Replace with param in get_text_in_rect across backends (optional) + ) + + cells_data = self._dpage["sanitized"]["cells"]["data"] + cells_header = self._dpage["sanitized"]["cells"]["header"] + + for i, cell_data in enumerate(cells_data): + x0 = cell_data[cells_header.index("x0")] + y0 = cell_data[cells_header.index("y0")] + x1 = cell_data[cells_header.index("x1")] + y1 = cell_data[cells_header.index("y1")] + + cell_bbox = BoundingBox( + l=x0 * scale * page_size.width / parser_width, + b=y0 * scale * page_size.height / parser_height, + r=x1 * scale * page_size.width / parser_width, + t=y1 * scale * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_height=page_size.height * scale) + + overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area() + + if overlap_frac > 0.5: + if len(text_piece) > 0: + text_piece += " " + text_piece += cell_data[cells_header.index("text")] + + return text_piece + + def get_text_cells(self) -> Iterable[Cell]: + cells: List[Cell] = [] + cell_counter = 0 + + if not self.valid: + return cells + + page_size = self.get_size() + + parser_width = self._dpage["sanitized"]["dimension"]["width"] + parser_height = self._dpage["sanitized"]["dimension"]["height"] + + cells_data = self._dpage["sanitized"]["cells"]["data"] + cells_header = self._dpage["sanitized"]["cells"]["header"] + + for i, cell_data in enumerate(cells_data): + x0 = cell_data[cells_header.index("x0")] + y0 = cell_data[cells_header.index("y0")] + x1 = cell_data[cells_header.index("x1")] + y1 = cell_data[cells_header.index("y1")] + + if x1 
< x0: + x0, x1 = x1, x0 + if y1 < y0: + y0, y1 = y1, y0 + + text_piece = cell_data[cells_header.index("text")] + cells.append( + Cell( + id=cell_counter, + text=text_piece, + bbox=BoundingBox( + # l=x0, b=y0, r=x1, t=y1, + l=x0 * page_size.width / parser_width, + b=y0 * page_size.height / parser_height, + r=x1 * page_size.width / parser_width, + t=y1 * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_size.height), + ) + ) + cell_counter += 1 + + def draw_clusters_and_cells(): + image = ( + self.get_page_image() + ) # make new image to avoid drawing on the saved ones + draw = ImageDraw.Draw(image) + for c in cells: + x0, y0, x1, y1 = c.bbox.as_tuple() + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) + image.show() + + # draw_clusters_and_cells() + + return cells + + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: + AREA_THRESHOLD = 32 * 32 + + images = self._dpage["sanitized"]["images"]["data"] + images_header = self._dpage["sanitized"]["images"]["header"] + + for row in images: + x0 = row[images_header.index("x0")] + y0 = row[images_header.index("y0")] + x1 = row[images_header.index("x1")] + y1 = row[images_header.index("y1")] + + cropbox = BoundingBox.from_tuple( + (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT + ).to_top_left_origin(self.get_size().height) + + if cropbox.area() > AREA_THRESHOLD: + cropbox = cropbox.scaled(scale=scale) + + yield cropbox + + def get_page_image( + self, scale: float = 1, cropbox: Optional[BoundingBox] = None + ) -> Image.Image: + + page_size = self.get_size() + + if not cropbox: + cropbox = BoundingBox( + l=0, + r=page_size.width, + t=0, + b=page_size.height, + coord_origin=CoordOrigin.TOPLEFT, + ) + padbox = BoundingBox( + l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT + ) + else: + padbox = cropbox.to_bottom_left_origin(page_size.height) 
+ padbox.r = page_size.width - padbox.r + padbox.t = page_size.height - padbox.t + + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) + ) # We resize the image from 1.5x the given scale to make it sharper. + + return image + + def get_size(self) -> Size: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) + + def unload(self): + self._ppage = None + self._dpage = None + + +class DoclingParseV2DocumentBackend(PdfDocumentBackend): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + self._pdoc = pdfium.PdfDocument(self.path_or_stream) + self.parser = pdf_parser_v2("fatal") + + success = False + if isinstance(path_or_stream, BytesIO): + success = self.parser.load_document_from_bytesio( + self.document_hash, path_or_stream + ) + elif isinstance(path_or_stream, Path): + success = self.parser.load_document(self.document_hash, str(path_or_stream)) + + if not success: + raise RuntimeError( + f"docling-parse v2 could not load document {self.document_hash}." 
+ ) + + def page_count(self) -> int: + return len(self._pdoc) # To be replaced with docling-parse API + + def load_page(self, page_no: int) -> DoclingParseV2PageBackend: + return DoclingParseV2PageBackend( + self.parser, self.document_hash, page_no, self._pdoc[page_no] + ) + + def is_valid(self) -> bool: + return self.page_count() > 0 + + def unload(self): + super().unload() + self.parser.unload_document(self.document_hash) + self._pdoc.close() + self._pdoc = None diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 6c5e5f04..c31981be 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,19 +1,20 @@ import logging from typing import Iterable +import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page -from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) class TesseractOcrModel(BaseOcrModel): - def __init__(self, enabled: bool, options: TesseractCliOcrOptions): + def __init__(self, enabled: bool, options: TesseractOcrOptions): super().__init__(enabled=enabled, options=options) - self.options: TesseractCliOcrOptions + self.options: TesseractOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
self.reader = None diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 00000000..8763d718 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg new file mode 100644 index 00000000..035671c0 --- /dev/null +++ b/docs/assets/logo.svg @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/batch_convert.py b/docs/examples/batch_convert.py similarity index 98% rename from examples/batch_convert.py rename to docs/examples/batch_convert.py index 02a6fc5e..73498915 100644 --- a/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -122,7 +122,7 @@ def main(): raises_on_error=False, # to let conversion run through all and examine results at the end ) success_count, partial_success_count, failure_count = export_documents( - conv_results, output_dir=Path("./scratch") + conv_results, output_dir=Path("../../examples/scratch") ) end_time = time.time() - start_time diff --git a/examples/custom_convert.py b/docs/examples/custom_convert.py similarity index 98% rename from examples/custom_convert.py rename to docs/examples/custom_convert.py index 7c28a23b..67d58a6b 100644 --- a/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -113,7 +113,7 @@ def main(): _log.info(f"Document converted in {end_time:.2f} seconds.") ## Export results - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") output_dir.mkdir(parents=True, exist_ok=True) doc_filename = conv_result.input.file.stem diff --git a/examples/export_figures.py b/docs/examples/export_figures.py similarity index 97% rename from examples/export_figures.py rename to docs/examples/export_figures.py index 4fa4dc58..d6fed16d 100644 --- a/examples/export_figures.py +++ 
b/docs/examples/export_figures.py @@ -15,7 +15,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. diff --git a/examples/export_multimodal.py b/docs/examples/export_multimodal.py similarity index 98% rename from examples/export_multimodal.py rename to docs/examples/export_multimodal.py index 01477f00..3646aea3 100644 --- a/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -20,7 +20,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. 
diff --git a/examples/export_tables.py b/docs/examples/export_tables.py similarity index 96% rename from examples/export_tables.py rename to docs/examples/export_tables.py index c7be89bc..ff962e80 100644 --- a/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -13,7 +13,7 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/2206.01062.pdf") - output_dir = Path("./scratch") + output_dir = Path("../../examples/scratch") doc_converter = DocumentConverter() diff --git a/examples/minimal.py b/docs/examples/minimal.py similarity index 100% rename from examples/minimal.py rename to docs/examples/minimal.py diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb new file mode 100644 index 00000000..f2464f29 --- /dev/null +++ b/docs/examples/rag_langchain.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG with LangChain 🦜🔗" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# requirements for this example:\n", + "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", + 
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loader and splitter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we set up:\n", + "- a `Loader` which will be used to create LangChain documents, and\n", + "- a splitter, which will be used to split these documents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from enum import Enum\n", + "from typing import Iterator\n", + "\n", + "from langchain_core.document_loaders import BaseLoader\n", + "from langchain_core.documents import Document as LCDocument\n", + "from pydantic import BaseModel\n", + "\n", + "from docling.document_converter import DocumentConverter\n", + "\n", + "\n", + "class DocumentMetadata(BaseModel):\n", + " dl_doc_hash: str\n", + " # source: str\n", + "\n", + "\n", + "class DoclingPDFLoader(BaseLoader):\n", + " class ParseType(str, Enum):\n", + " MARKDOWN = \"markdown\"\n", + " # JSON = \"json\"\n", + "\n", + " def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n", + " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", + " self._parse_type = parse_type\n", + " self._converter = DocumentConverter()\n", + "\n", + " def lazy_load(self) -> Iterator[LCDocument]:\n", + " for source in self._file_paths:\n", + " dl_doc = self._converter.convert_single(source).output\n", + " match self._parse_type:\n", + " case self.ParseType.MARKDOWN:\n", + " text = dl_doc.export_to_markdown()\n", + " # case self.ParseType.JSON:\n", + " # text = dl_doc.model_dump_json()\n", + " case _:\n", + " raise RuntimeError(\n", + " f\"Unexpected parse type encountered: {self._parse_type}\"\n", + " )\n", + " lc_doc = LCDocument(\n", + " page_content=text,\n", + " 
metadata=DocumentMetadata(\n", + " dl_doc_hash=dl_doc.file_info.document_hash,\n", + " ).model_dump(),\n", + " )\n", + " yield lc_doc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1b38d07d5fed4618a44ecf261e1e5c44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 7 files: 0%| | 0/7 [00:00\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG with LlamaIndex 🦙" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n", + "\n", + "Presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n", + "- use PDF documents in your LLM applications with ease and speed, and\n", + "- harness Docling's rich format for advanced, document-native grounding." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. 
if running on Colab, use GPU-enabled runtime.\n", + "- Notebook uses HuggingFace's Inference API; for increased LLM quota, token can be provided via env var `HF_TOKEN`.\n", + "- Requirements can be installed as shown below (`--no-warn-conflicts` meant for Colab's pre-populated Python env; feel free to remove for stricter usage):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "from warnings import filterwarnings\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "\n", + "def _get_env_from_colab_or_os(key):\n", + " try:\n", + " from google.colab import userdata\n", + "\n", + " try:\n", + " return userdata.get(key)\n", + " except userdata.SecretNotFoundError:\n", + " pass\n", + " except ImportError:\n", + " pass\n", + " return os.getenv(key)\n", + "\n", + "\n", + "load_dotenv()\n", + "\n", + "filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")\n", + "filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n", + "# https://github.com/huggingface/transformers/issues/5486:\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now define the main parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from 
llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", + "\n", + "EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n", + "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n", + "GEN_MODEL = HuggingFaceInferenceAPI(\n", + " token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n", + " model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n", + ")\n", + "SOURCE = \"https://arxiv.org/pdf/2408.09869\" # Docling Technical Report\n", + "QUERY = \"Which are the main AI models in Docling?\"\n", + "\n", + "embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Markdown export" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create a simple RAG pipeline, we can:\n", + "- define a `DoclingReader`, which by default exports to Markdown, and\n", + "- use a standard node parser for these Markdown-based docs, e.g. a `MarkdownNodeParser`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q: Which are the main AI models in Docling?\n", + "A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n", + "\n", + "Sources:\n" + ] + }, + { + "data": { + "text/plain": [ + "[('3.2 AI models\\n\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", + " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'Header_2': '3.2 AI models'}),\n", + " (\"5 Applications\\n\\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. 
Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.\",\n", + " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'Header_2': '5 Applications'})]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from llama_index.core import StorageContext, VectorStoreIndex\n", + "from llama_index.core.node_parser import MarkdownNodeParser\n", + "from llama_index.readers.docling import DoclingReader\n", + "from llama_index.vector_stores.milvus import MilvusVectorStore\n", + "\n", + "reader = DoclingReader()\n", + "node_parser = MarkdownNodeParser()\n", + "\n", + "vector_store = MilvusVectorStore(\n", + " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " dim=embed_dim,\n", + " overwrite=True,\n", + ")\n", + "index = VectorStoreIndex.from_documents(\n", + " documents=reader.load_data(SOURCE),\n", + " transformations=[node_parser],\n", + " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", + " embed_model=EMBED_MODEL,\n", + ")\n", + "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", + "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", + "display([(n.text, n.metadata) for n in result.source_nodes])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Docling format" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To leverage Docling's rich native format, we:\n", + "- create a `DoclingReader` with JSON export type, and\n", + "- employ a `DoclingNodeParser` in order to appropriately parse that Docling format.\n", + "\n", + "Notice how the sources now also contain document-level grounding (e.g. 
page number or bounding box information):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q: Which are the main AI models in Docling?\n", + "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n", + "\n", + "Sources:\n" + ] + }, + { + "data": { + "text/plain": [ + "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", + " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'path': '#/main-text/37',\n", + " 'heading': '3.2 AI models',\n", + " 'page': 3,\n", + " 'bbox': [107.36903381347656,\n", + " 330.07513427734375,\n", + " 506.29705810546875,\n", + " 407.3725280761719]}),\n", + " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.',\n", + " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'path': '#/main-text/10',\n", + " 'heading': '1 Introduction',\n", + " 'page': 1,\n", + " 'bbox': [107.33261108398438,\n", + " 83.3067626953125,\n", + " 504.0033874511719,\n", + " 136.45367431640625]})]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from llama_index.node_parser.docling import DoclingNodeParser\n", + "\n", + "reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n", + "node_parser = DoclingNodeParser()\n", + "\n", + "vector_store = MilvusVectorStore(\n", + " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " dim=embed_dim,\n", + " overwrite=True,\n", + ")\n", + "index = VectorStoreIndex.from_documents(\n", + " documents=reader.load_data(SOURCE),\n", + " transformations=[node_parser],\n", + " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", + " embed_model=EMBED_MODEL,\n", + ")\n", + "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", + "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", + "display([(n.text, n.metadata) for n in result.source_nodes])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## With Simple Directory Reader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To demonstrate this usage pattern, we first set up a test document directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "\n", + "import requests\n", + "\n", + "tmp_dir_path = Path(mkdtemp())\n", + "r = requests.get(SOURCE)\n", + "with open(tmp_dir_path / f\"{Path(SOURCE).name}.pdf\", \"wb\") as out_file:\n", + " out_file.write(r.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the `reader` and `node_parser` definitions from any of the above variants, usage with `SimpleDirectoryReader` then looks as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.15s/file]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q: Which are the main AI models in Docling?\n", + "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n", + "\n", + "Sources:\n" + ] + }, + { + "data": { + "text/plain": [ + "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", + " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", + " 'file_name': '2408.09869.pdf',\n", + " 'file_type': 'application/pdf',\n", + " 'file_size': 5566574,\n", + " 'creation_date': '2024-10-09',\n", + " 'last_modified_date': '2024-10-09',\n", + " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'path': '#/main-text/37',\n", + " 'heading': '3.2 AI models',\n", + " 'page': 3,\n", + " 'bbox': [107.36903381347656,\n", + " 330.07513427734375,\n", + " 506.29705810546875,\n", + " 407.3725280761719]}),\n", + " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.',\n", + " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", + " 'file_name': '2408.09869.pdf',\n", + " 'file_type': 'application/pdf',\n", + " 'file_size': 5566574,\n", + " 'creation_date': '2024-10-09',\n", + " 'last_modified_date': '2024-10-09',\n", + " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", + " 'path': '#/main-text/10',\n", + " 'heading': '1 Introduction',\n", + " 'page': 1,\n", + " 'bbox': [107.33261108398438,\n", + " 83.3067626953125,\n", + " 504.0033874511719,\n", + " 136.45367431640625]})]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "\n", + "dir_reader = SimpleDirectoryReader(\n", + " input_dir=tmp_dir_path,\n", + " file_extractor={\".pdf\": reader},\n", + ")\n", + "\n", + "vector_store = MilvusVectorStore(\n", + " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " dim=embed_dim,\n", + " overwrite=True,\n", + ")\n", + "index = VectorStoreIndex.from_documents(\n", + " documents=dir_reader.load_data(SOURCE),\n", + " transformations=[node_parser],\n", + " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", + " embed_model=EMBED_MODEL,\n", + ")\n", + "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", + "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", + "display([(n.text, n.metadata) for n in result.source_nodes])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", 
+ "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/run_with_formats.py b/docs/examples/run_with_formats.py similarity index 97% rename from examples/run_with_formats.py rename to docs/examples/run_with_formats.py index 6ff33034..4ae7055c 100644 --- a/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -53,7 +53,7 @@ doc_converter = ( conv_results = doc_converter.convert_all(input_paths) for res in conv_results: - out_path = Path("./scratch") + out_path = Path("../../examples/scratch") print( f"Document {res.input.file.name} converted." f"\nSaved markdown output to: {str(out_path)}" diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..6f218e48 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,29 @@ +# Docling + +

+ + Docling + +

+ + +[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869) +[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/) +![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue) +[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) +[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev) +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) +[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) + +Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package. + +## Features + +* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast +* 📑 Understands detailed page layout, reading order and recovers table structures +* 📝 Extracts metadata from the document, such as title, authors, references and language +* 🔍 Includes OCR support for scanned PDFs +* 🤖 Integrates easily with LLM app / RAG frameworks like LlamaIndex 🦙 & LangChain 🦜🔗 +* 💻 Provides a simple and convenient CLI diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..df18dece --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,100 @@ +To use Docling, simply install `docling` from your Python package manager, e.g. pip: +```bash +pip install docling +``` + +Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures. + +??? 
"Alternative PyTorch distributions" + + The Docling models depend on the [PyTorch](https://pytorch.org/) library. + Depending on your architecture, you might want to use a different distribution of `torch`. + For example, you might want support for a different accelerator or for a cpu-only version. + All the different ways for installing `torch` are listed on their website <https://pytorch.org/>. + + One common situation is the installation on Linux systems with cpu-only support. + In this case, we suggest the installation of Docling with the following options: + + ```bash + # Example for installing on the Linux cpu-only version + pip install docling --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +??? "Alternative OCR engines" + + Docling supports multiple OCR engines for processing scanned documents. The current version provides + the following engines. + + | Engine | Installation | Usage | + | ------ | ------------ | ----- | + | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` | + | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` | + | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` | + + The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example: + + ```python + from docling.datamodel.base_models import ConversionStatus, PipelineOptions + from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions + from docling.document_converter import DocumentConverter + + pipeline_options = PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract + + doc_converter = DocumentConverter( + pipeline_options=pipeline_options, + ) + ``` + +

Tesseract installation

+ + [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available + on most operating systems. For using this engine with Docling, Tesseract must be installed on your + system, using the packaging tool of your choice. Below we provide example commands. + After installing Tesseract you are expected to provide the path to its language files using the + `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`). + + === "macOS (via [Homebrew](https://brew.sh/))" + + ```console + brew install tesseract leptonica pkg-config + TESSDATA_PREFIX=/opt/homebrew/share/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + === "Debian-based" + + ```console + apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config + TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$) + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + === "RHEL" + + ```console + dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel + TESSDATA_PREFIX=/usr/share/tesseract/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + +

Linking to Tesseract

+ The most efficient usage of the Tesseract library is via linking. Docling uses + the [Tesserocr](https://github.com/sirfz/tesserocr) package for this. + + If you run into installation issues with Tesserocr, we suggest using the following + installation options: + + ```console + pip uninstall tesserocr + pip install --no-binary :all: tesserocr + ``` + +## Development setup + +To develop Docling features, bugfixes etc., install as follows from your local clone's root dir: + +```bash +poetry install --all-extras +``` diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md new file mode 100644 index 00000000..d3c0f46e --- /dev/null +++ b/docs/integrations/llamaindex.md @@ -0,0 +1,25 @@ +## Get started + +Docling is available as an official LlamaIndex extension! + +To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). + +## Components + +### Docling Reader + +Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown). + +- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling) +- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/) +- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/) +- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling) + +### Docling Node Parser + +Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding. 
+ +- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling) +- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/) +- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/) +- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling) diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 00000000..a60852d8 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,7 @@ +{% extends "base.html" %} + +{# +{% block announce %} +

🎉 Docling is now officially supported in LlamaIndex! Check it out!

+{% endblock %} +#} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..5beec977 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,3 @@ +[data-md-color-scheme="default"] .md-banner a { + color: #5e8bde; +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..2deb6463 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,97 @@ +site_name: Docling +site_url: https://ds4sd.github.io/docling/ +repo_name: DS4SD/docling +repo_url: https://github.com/DS4SD/docling + +theme: + name: material + custom_dir: docs/overrides + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + scheme: default + primary: black + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: black + toggle: + icon: material/brightness-7 + name: Switch to dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/brightness-4 + name: Switch to system preference + + logo: assets/logo.png + favicon: assets/logo.png + features: + - content.tabs.link + - content.code.annotate + - content.code.copy + - announce.dismiss + - navigation.tabs + # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used + - navigation.instant + - navigation.instant.prefetch + # - navigation.instant.preview + - navigation.instant.progress + - navigation.path + - navigation.sections # <= + - navigation.top + - navigation.tracking + - search.suggest + - toc.follow +nav: + - Get started: + - Home: index.md + - Installation: installation.md + # - Docling v2: v2.md + # - Concepts: + # - Docling Document: concepts/document.md + # - Chunking: concepts/chunking.md + - Examples: + - Conversion: + - "Simple conversion": examples/minimal.py + - "Custom conversion": examples/custom_convert.py + - 
"Batch conversion": examples/batch_convert.py + - "Figure export": examples/export_figures.py + - "Table export": examples/export_tables.py + - "Multimodal export": examples/export_multimodal.py + - RAG / QA: + - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb + - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb + # - Chunking: + # - Chunking: examples/chunking.md + # - CLI: + # - CLI: examples/cli.md + - Integrations: + - "LlamaIndex 🦙 extension": integrations/llamaindex.md + # - "LangChain 🦜🔗 extension": integrations/langchain.md + # - API reference: + # - API reference: api_reference/index.md + +markdown_extensions: + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + - admonition + - pymdownx.details + - attr_list +plugins: + - search + - mkdocs-jupyter + +extra_css: + - stylesheets/extra.css diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py new file mode 100644 index 00000000..8c4252cb --- /dev/null +++ b/tests/test_backend_docling_parse_v2.py @@ -0,0 +1,77 @@ +from pathlib import Path + +import pytest + +from docling.backend.docling_parse_v2_backend import ( + DoclingParseV2DocumentBackend, + DoclingParseV2PageBackend, +) +from docling.datamodel.base_models import BoundingBox, InputFormat +from docling.datamodel.document import InputDocument + + +@pytest.fixture +def test_doc_path(): + return Path("./tests/data/2206.01062.pdf") + + +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=DoclingParseV2DocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + +@pytest.mark.skip +def test_text_cell_counts(): + pdf_doc = Path("./tests/data/redp5695.pdf") + + doc_backend = _get_backend(pdf_doc) + + for page_index in range(0, doc_backend.page_count()): + last_cell_count = None + for i in range(10): + page_backend: 
DoclingParseV2PageBackend = doc_backend.load_page(page_index)  # same page each time; counts must be stable across reloads + cells = list(page_backend.get_text_cells()) + + if last_cell_count is None: + last_cell_count = len(cells) + + if len(cells) != last_cell_count: + assert ( + False + ), "Loading page multiple times yielded non-identical text cell counts" + last_cell_count = len(cells) + + +def test_get_text_from_rect(test_doc_path): + doc_backend = _get_backend(test_doc_path) + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) + + # Get the title text of the DocLayNet paper + textpiece = page_backend.get_text_in_rect( + bbox=BoundingBox(l=102, t=77, r=511, b=124) + ) + ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" + + assert textpiece.strip() == ref + + +def test_crop_page_image(test_doc_path): + doc_backend = _get_backend(test_doc_path) + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) + + # Crop out "Figure 1" from the DocLayNet paper + im = page_backend.get_page_image( + scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527) + ) + # im.show() + + +def test_num_pages(test_doc_path): + doc_backend = _get_backend(test_doc_path) + assert doc_backend.page_count() == 9