From 72ab8e1821e79c0d732e5a9f13973f019914fa27 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 10 Apr 2025 12:24:09 +0000 Subject: [PATCH 1/6] chore: bump version to 2.29.0 [skip ci] --- CHANGELOG.md | 19 +++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdbd2b22..7856fe2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [v2.29.0](https://github.com/docling-project/docling/releases/tag/v2.29.0) - 2025-04-10 + +### Feature + +* Handle tags as code blocks ([#1320](https://github.com/docling-project/docling/issues/1320)) ([`0499cd1`](https://github.com/docling-project/docling/commit/0499cd1c1e93f74260754476a8423059915f59c2)) +* **docx:** Add text formatting and hyperlink support ([#630](https://github.com/docling-project/docling/issues/630)) ([`bfcab3d`](https://github.com/docling-project/docling/commit/bfcab3d6778e6f622bb4a6b241bdb4bab22ba378)) + +### Fix + +* **docx:** Adding new latex symbols, simplifying how equations are added to text ([#1295](https://github.com/docling-project/docling/issues/1295)) ([`14e9c0c`](https://github.com/docling-project/docling/commit/14e9c0ce9a7559fac96ba5ed82befa12a7f53bfa)) +* **pptx:** Check if picture shape has an image attached ([#1316](https://github.com/docling-project/docling/issues/1316)) ([`dc3bf9c`](https://github.com/docling-project/docling/commit/dc3bf9ceacb7048a97ceb8b7aa80bfccc8a05ca5)) +* **docx:** Improve text parsing ([#1268](https://github.com/docling-project/docling/issues/1268)) ([`d2d6874`](https://github.com/docling-project/docling/commit/d2d68747f9c31be897f3e63c160c835086d37014)) +* Tesseract OCR CLI can't process images composed with numbers only ([#1201](https://github.com/docling-project/docling/issues/1201)) ([`b3d111a`](https://github.com/docling-project/docling/commit/b3d111a3cdb90b653ddaaa356f9299e9cd39b340)) + +### Documentation + +* Add plugins docs ([#1319](https://github.com/docling-project/docling/issues/1319)) ([`2e99e5a`](https://github.com/docling-project/docling/commit/2e99e5a54fafd901d8f26b56b25bb006c0e8e8b0)) +* Add visual grounding example ([#1270](https://github.com/docling-project/docling/issues/1270)) ([`71148eb`](https://github.com/docling-project/docling/commit/71148eb381747a6b899c84b72946ba9bde665a40)) + ## [v2.28.4](https://github.com/docling-project/docling/releases/tag/v2.28.4) - 2025-03-29 ### Fix diff --git a/pyproject.toml b/pyproject.toml index dd48a9d2..ee3e067e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.28.4" # DO NOT EDIT, updated automatically +version = "2.29.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = [ "Christoph Auer ", From 6b696b504a03ba49f05237d0e1b23fcced1a538a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Fabr=C3=A9gat?= Date: Thu, 10 Apr 2025 16:11:28 +0200 Subject: [PATCH 2/6] fix: Properly address page in pipeline _assemble_document when page_range is provided (#1334) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixes #1333 Signed-off-by: Joan Fabrégat * fix for the (dumb) MyPy type checker Signed-off-by: Joan Fabrégat --------- Signed-off-by: Joan Fabrégat --- docling/pipeline/standard_pdf_pipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index ecaa27c7..ae2d918d 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -2,7 +2,7 @@ import logging import sys import warnings from pathlib import Path -from typing import Optional +from typing import Optional, cast from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem @@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline): and self.pipeline_options.generate_table_images ): page_ix = element.prov[0].page_no - 1 - page = conv_res.pages[page_ix] + page = next( + (p for p in conv_res.pages if p.page_no == page_ix), + cast("Page", None), + ) + assert page is not None assert page.size is not None assert page.image is not None From c605edd8e91d988f6dca2bdfc67c54d6396fe903 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 10 Apr 2025 10:03:04 -0600 Subject: [PATCH 3/6] feat: OllamaVlmModel for Granite Vision 3.2 (#1337) * build: Add ollama sdk dependency Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * feat: Add option plumbing for OllamaVlmOptions in pipeline_options Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * feat: Full implementation of OllamaVlmModel Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * feat: Connect "granite_vision_ollama" pipeline option to CLI Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * Revert "build: Add ollama sdk dependency" After consideration, we're going to use the generic OpenAI API instead of the Ollama-specific API to avoid duplicate work. This reverts commit bc6b366468cdd66b52540aac9c7d8b584ab48ad0. Signed-off-by: Gabe Goodhart * refactor: Move OpenAI API call logic into utils.utils This will allow reuse of this logic in a generic VLM model NOTE: There is a subtle change here in the ordering of the text prompt and the image in the call to the OpenAI API. When run against Ollama, this ordering makes a big difference. If the prompt comes before the image, the result is terse and not usable whereas the prompt coming after the image works as expected and matches the non-OpenAI chat API. Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * refactor: Refactor from Ollama SDK to generic OpenAI API Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * fix: Linting, formatting, and bug fixes The one bug fix was in the timeout arg to openai_image_request. Otherwise, this is all style changes to get MyPy and black passing cleanly. Branch: OllamaVlmModel Signed-off-by: Gabe Goodhart * remove model from download enum Signed-off-by: Michele Dolfi * generalize input args for other API providers Signed-off-by: Michele Dolfi * rename and refactor Signed-off-by: Michele Dolfi * add example Signed-off-by: Michele Dolfi * require flag for remote services Signed-off-by: Michele Dolfi * disable example from CI Signed-off-by: Michele Dolfi * add examples to docs Signed-off-by: Michele Dolfi --------- Signed-off-by: Gabe Goodhart Signed-off-by: Michele Dolfi Co-authored-by: Michele Dolfi --- .github/workflows/checks.yml | 2 +- docling/cli/main.py | 9 +- docling/datamodel/base_models.py | 32 +++++ docling/datamodel/pipeline_options.py | 28 ++++- docling/models/api_vlm_model.py | 67 +++++++++++ .../models/picture_description_api_model.py | 83 ++----------- docling/pipeline/vlm_pipeline.py | 44 ++++--- docling/utils/api_image_request.py | 61 ++++++++++ docs/examples/vlm_pipeline_api_model.py | 111 ++++++++++++++++++ mkdocs.yml | 2 + 10 files changed, 344 insertions(+), 95 deletions(-) create mode 100644 docling/models/api_vlm_model.py create mode 100644 docling/utils/api_image_request.py create mode 100644 docs/examples/vlm_pipeline_api_model.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index b2a295dc..ee5ba79b 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -37,7 +37,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then echo "Skipping $file" continue fi diff --git a/docling/cli/main.py b/docling/cli/main.py index e0f0cbd8..c87e311f 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import ( VlmModelType, VlmPipelineOptions, granite_vision_vlm_conversion_options, + granite_vision_vlm_ollama_conversion_options, smoldocling_vlm_conversion_options, smoldocling_vlm_mlx_conversion_options, ) @@ -531,10 +532,16 @@ def convert( backend=backend, # pdf_backend ) elif pipeline == PdfPipeline.VLM: - pipeline_options = VlmPipelineOptions() + pipeline_options = VlmPipelineOptions( + enable_remote_services=enable_remote_services, + ) if vlm_model == VlmModelType.GRANITE_VISION: pipeline_options.vlm_options = granite_vision_vlm_conversion_options + elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: + pipeline_options.vlm_options = ( + granite_vision_vlm_ollama_conversion_options + ) elif vlm_model == VlmModelType.SMOLDOCLING: pipeline_options.vlm_options = smoldocling_vlm_conversion_options if sys.platform == "darwin": diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 76827a1b..7dcf89c0 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -262,3 +262,35 @@ class Page(BaseModel): @property def image(self) -> Optional[Image]: return self.get_image(scale=self._default_image_scale) + + +## OpenAI API Request / Response Models ## + + +class OpenAiChatMessage(BaseModel): + role: str + content: str + + +class OpenAiResponseChoice(BaseModel): + index: int + message: OpenAiChatMessage + finish_reason: str + + +class OpenAiResponseUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class OpenAiApiResponse(BaseModel): + model_config = ConfigDict( + protected_namespaces=(), + ) + + id: str + model: Optional[str] = None # returned by openai + choices: List[OpenAiResponseChoice] + created: int + usage: OpenAiResponseUsage diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 654e04df..9791a251 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -266,6 +266,7 @@ class ResponseFormat(str, Enum): class InferenceFramework(str, Enum): MLX = "mlx" TRANSFORMERS = "transformers" + OPENAI = "openai" class HuggingFaceVlmOptions(BaseVlmOptions): @@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions): return self.repo_id.replace("/", "--") +class ApiVlmOptions(BaseVlmOptions): + kind: Literal["api_model_options"] = "api_model_options" + + url: AnyUrl = AnyUrl( + "http://localhost:11434/v1/chat/completions" + ) # Default to ollama + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + scale: float = 2.0 + timeout: float = 60 + response_format: ResponseFormat + + smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions( repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", prompt="Convert this page to docling.", @@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( inference_framework=InferenceFramework.TRANSFORMERS, ) +granite_vision_vlm_ollama_conversion_options = ApiVlmOptions( + url=AnyUrl("http://localhost:11434/v1/chat/completions"), + params={"model": "granite3.2-vision:2b"}, + prompt="OCR the full page to markdown.", + scale=1.0, + timeout=120, + response_format=ResponseFormat.MARKDOWN, +) + class VlmModelType(str, Enum): SMOLDOCLING = "smoldocling" GRANITE_VISION = "granite_vision" + GRANITE_VISION_OLLAMA = "granite_vision_ollama" # Define an enum for the backend options @@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions): False # (To be used with vlms, or other generative models) ) # If True, text from backend will be used instead of generated text - vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options + vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = ( + smoldocling_vlm_conversion_options + ) class PdfPipelineOptions(PaginatedPipelineOptions): diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py new file mode 100644 index 00000000..95201224 --- /dev/null +++ b/docling/models/api_vlm_model.py @@ -0,0 +1,67 @@ +from typing import Iterable + +from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ApiVlmOptions +from docling.exceptions import OperationNotAllowed +from docling.models.base_model import BasePageModel +from docling.utils.api_image_request import api_image_request +from docling.utils.profiling import TimeRecorder + + +class ApiVlmModel(BasePageModel): + + def __init__( + self, + enabled: bool, + enable_remote_services: bool, + vlm_options: ApiVlmOptions, + ): + self.enabled = enabled + self.vlm_options = vlm_options + if self.enabled: + if not enable_remote_services: + raise OperationNotAllowed( + "Connections to remote services is only allowed when set explicitly. " + "pipeline_options.enable_remote_services=True, or using the CLI " + "--enable-remote-services." + ) + + self.timeout = self.vlm_options.timeout + self.prompt_content = ( + f"This is a page from a document.\n{self.vlm_options.prompt}" + ) + self.params = { + **self.vlm_options.params, + "temperature": 0, + } + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "vlm"): + assert page.size is not None + + hi_res_image = page.get_image(scale=self.vlm_options.scale) + assert hi_res_image is not None + if hi_res_image: + if hi_res_image.mode != "RGB": + hi_res_image = hi_res_image.convert("RGB") + + page_tags = api_image_request( + image=hi_res_image, + prompt=self.prompt_content, + url=self.vlm_options.url, + timeout=self.timeout, + headers=self.vlm_options.headers, + **self.params, + ) + + page.predictions.vlm_response = VlmPrediction(text=page_tags) + + yield page diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py index 6ef8a7fc..1aa73518 100644 --- a/docling/models/picture_description_api_model.py +++ b/docling/models/picture_description_api_model.py @@ -1,12 +1,7 @@ -import base64 -import io -import logging from pathlib import Path -from typing import Iterable, List, Optional, Type, Union +from typing import Iterable, Optional, Type, Union -import requests from PIL import Image -from pydantic import BaseModel, ConfigDict from docling.datamodel.pipeline_options import ( AcceleratorOptions, @@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import ( ) from docling.exceptions import OperationNotAllowed from docling.models.picture_description_base_model import PictureDescriptionBaseModel - -_log = logging.getLogger(__name__) - - -class ChatMessage(BaseModel): - role: str - content: str - - -class ResponseChoice(BaseModel): - index: int - message: ChatMessage - finish_reason: str - - -class ResponseUsage(BaseModel): - prompt_tokens: int - completion_tokens: int - total_tokens: int - - -class ApiResponse(BaseModel): - model_config = ConfigDict( - protected_namespaces=(), - ) - - id: str - model: Optional[str] = None # returned by openai - choices: List[ResponseChoice] - created: int - usage: ResponseUsage +from docling.utils.api_image_request import api_image_request class PictureDescriptionApiModel(PictureDescriptionBaseModel): @@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel): # Note: technically we could make a batch request here, # but not all APIs will allow for it. For example, vllm won't allow more than 1. for image in images: - img_io = io.BytesIO() - image.save(img_io, "PNG") - image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") - - messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": self.options.prompt, - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}" - }, - }, - ], - } - ] - - payload = { - "messages": messages, - **self.options.params, - } - - r = requests.post( - str(self.options.url), - headers=self.options.headers, - json=payload, + yield api_image_request( + image=image, + prompt=self.options.prompt, + url=self.options.url, timeout=self.options.timeout, + headers=self.options.headers, + **self.options.params, ) - if not r.ok: - _log.error(f"Error calling the API. Reponse was {r.text}") - r.raise_for_status() - - api_resp = ApiResponse.model_validate_json(r.text) - generated_text = api_resp.choices[0].message.content.strip() - yield generated_text diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index d4defa89..79279fd0 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import ( + ApiVlmOptions, + HuggingFaceVlmOptions, InferenceFramework, ResponseFormat, VlmPipelineOptions, ) from docling.datamodel.settings import settings +from docling.models.api_vlm_model import ApiVlmModel from docling.models.hf_mlx_model import HuggingFaceMlxModel from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.pipeline.base_pipeline import PaginatedPipeline @@ -57,27 +60,34 @@ class VlmPipeline(PaginatedPipeline): self.keep_images = self.pipeline_options.generate_page_images - if ( - self.pipeline_options.vlm_options.inference_framework - == InferenceFramework.MLX - ): + if isinstance(pipeline_options.vlm_options, ApiVlmOptions): self.build_pipe = [ - HuggingFaceMlxModel( + ApiVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. - artifacts_path=artifacts_path, - accelerator_options=pipeline_options.accelerator_options, - vlm_options=self.pipeline_options.vlm_options, - ), - ] - else: - self.build_pipe = [ - HuggingFaceVlmModel( - enabled=True, # must be always enabled for this pipeline to make sense. - artifacts_path=artifacts_path, - accelerator_options=pipeline_options.accelerator_options, - vlm_options=self.pipeline_options.vlm_options, + enable_remote_services=self.pipeline_options.enable_remote_services, + vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options), ), ] + elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions): + vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options) + if vlm_options.inference_framework == InferenceFramework.MLX: + self.build_pipe = [ + HuggingFaceMlxModel( + enabled=True, # must be always enabled for this pipeline to make sense. + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=vlm_options, + ), + ] + else: + self.build_pipe = [ + HuggingFaceVlmModel( + enabled=True, # must be always enabled for this pipeline to make sense. + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=vlm_options, + ), + ] self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument diff --git a/docling/utils/api_image_request.py b/docling/utils/api_image_request.py new file mode 100644 index 00000000..9227389c --- /dev/null +++ b/docling/utils/api_image_request.py @@ -0,0 +1,61 @@ +import base64 +import logging +from io import BytesIO +from typing import Dict, Optional + +import requests +from PIL import Image +from pydantic import AnyUrl + +from docling.datamodel.base_models import OpenAiApiResponse + +_log = logging.getLogger(__name__) + + +def api_image_request( + image: Image.Image, + prompt: str, + url: AnyUrl, + timeout: float = 20, + headers: Optional[Dict[str, str]] = None, + **params, +) -> str: + img_io = BytesIO() + image.save(img_io, "PNG") + image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_base64}"}, + }, + { + "type": "text", + "text": prompt, + }, + ], + } + ] + + payload = { + "messages": messages, + **params, + } + + headers = headers or {} + + r = requests.post( + str(url), + headers=headers, + json=payload, + timeout=timeout, + ) + if not r.ok: + _log.error(f"Error calling the API. Response was {r.text}") + r.raise_for_status() + + api_resp = OpenAiApiResponse.model_validate_json(r.text) + generated_text = api_resp.choices[0].message.content.strip() + return generated_text diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py new file mode 100644 index 00000000..33fb72a2 --- /dev/null +++ b/docs/examples/vlm_pipeline_api_model.py @@ -0,0 +1,111 @@ +import logging +import os +from pathlib import Path + +import requests +from dotenv import load_dotenv + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + ApiVlmOptions, + ResponseFormat, + VlmPipelineOptions, + granite_vision_vlm_ollama_conversion_options, +) +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.vlm_pipeline import VlmPipeline + + +def ollama_vlm_options(model: str, prompt: str): + options = ApiVlmOptions( + url="http://localhost:11434/v1/chat/completions", # the default Ollama endpoint + params=dict( + model=model, + ), + prompt=prompt, + timeout=90, + scale=1.0, + response_format=ResponseFormat.MARKDOWN, + ) + return options + + +def watsonx_vlm_options(model: str, prompt: str): + load_dotenv() + api_key = os.environ.get("WX_API_KEY") + project_id = os.environ.get("WX_PROJECT_ID") + + def _get_iam_access_token(api_key: str) -> str: + res = requests.post( + url="https://iam.cloud.ibm.com/identity/token", + headers={ + "Content-Type": "application/x-www-form-urlencoded", + }, + data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}", + ) + res.raise_for_status() + api_out = res.json() + print(f"{api_out=}") + return api_out["access_token"] + + options = ApiVlmOptions( + url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29", + params=dict( + model_id=model, + project_id=project_id, + parameters=dict( + max_new_tokens=400, + ), + ), + headers={ + "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key), + }, + prompt=prompt, + timeout=60, + response_format=ResponseFormat.MARKDOWN, + ) + return options + + +def main(): + logging.basicConfig(level=logging.INFO) + + # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") + + pipeline_options = VlmPipelineOptions( + enable_remote_services=True # <-- this is required! + ) + + # The ApiVlmOptions() allows to interface with APIs supporting + # the multi-modal chat interface. Here follow a few example on how to configure those. + + # One possibility is self-hosting model, e.g. via Ollama. + # Example using the Granite Vision model: (uncomment the following lines) + pipeline_options.vlm_options = ollama_vlm_options( + model="granite3.2-vision:2b", + prompt="OCR the full page to markdown.", + ) + + # Another possibility is using online services, e.g. watsonx.ai. + # Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID. + # Uncomment the following line for this option: + # pipeline_options.vlm_options = watsonx_vlm_options( + # model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown." + # ) + + # Create the DocumentConverter and launch the conversion. + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + pipeline_cls=VlmPipeline, + ) + } + ) + result = doc_converter.convert(input_doc_path) + print(result.document.export_to_markdown()) + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 0fc7f5f1..dd842d6d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,6 +75,8 @@ nav: - "Custom conversion": examples/custom_convert.py - "Batch conversion": examples/batch_convert.py - "Multi-format conversion": examples/run_with_formats.py + - "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py + - "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py - "Figure export": examples/export_figures.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py From eef2bdea77fa32061e798f538bf2cd95f8d72165 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 11 Apr 2025 10:29:53 +0200 Subject: [PATCH 4/6] feat(xlsx): create a page for each worksheet in XLSX backend (#1332) * sytle(xlsx): enforce type hints in XLSX backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * feat(xlsx): create a page for each worksheet in XLSX backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs(xlsx): add docstrings to XLSX backend module. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docling(xlsx): add bounding boxes and page size information in cell units Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msexcel_backend.py | 360 +++++++++++++----- .../groundtruth/docling_v2/test-01.xlsx.json | 143 ++++++- tests/test_backend_msexcel.py | 50 ++- 3 files changed, 452 insertions(+), 101 deletions(-) diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 121637a1..971b93cd 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -1,36 +1,50 @@ import logging from io import BytesIO from pathlib import Path -from typing import Dict, Set, Tuple, Union +from typing import Any, Union, cast from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, + DocItem, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, + ProvenanceItem, + Size, TableCell, TableData, ) - -# from lxml import etree -from openpyxl import Workbook, load_workbook -from openpyxl.cell.cell import Cell +from openpyxl import load_workbook from openpyxl.drawing.image import Image +from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor from openpyxl.worksheet.worksheet import Worksheet +from PIL import Image as PILImage +from pydantic import BaseModel, NonNegativeInt, PositiveInt +from typing_extensions import override -from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.abstract_backend import ( + DeclarativeDocumentBackend, + PaginatedDocumentBackend, +) from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) -from typing import Any, List - -from PIL import Image as PILImage -from pydantic import BaseModel - class ExcelCell(BaseModel): + """Represents an Excel cell. + + Attributes: + row: The row number of the cell. + col: The column number of the cell. + text: The text content of the cell. + row_span: The number of rows the cell spans. + col_span: The number of columns the cell spans. + """ + row: int col: int text: str @@ -39,19 +53,57 @@ class ExcelCell(BaseModel): class ExcelTable(BaseModel): + """Represents an Excel table on a worksheet. + + Attributes: + anchor: The column and row indices of the upper-left cell of the table + (0-based index). + num_rows: The number of rows in the table. + num_cols: The number of columns in the table. + data: The data in the table, represented as a list of ExcelCell objects. + """ + + anchor: tuple[NonNegativeInt, NonNegativeInt] num_rows: int num_cols: int - data: List[ExcelCell] + data: list[ExcelCell] -class MsExcelDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): +class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): + """Backend for parsing Excel workbooks. + + The backend converts an Excel workbook into a DoclingDocument object. + Each worksheet is converted into a separate page. + The following elements are parsed: + - Cell contents, parsed as tables. If two groups of cells are disconnected + between each other, they will be parsed as two different tables. + - Images, parsed as PictureItem objects. + + The DoclingDocument tables and pictures have their provenance information, including + the position in their original Excel worksheet. The position is represented by a + bounding box object with the cell indices as units (0-based index). The size of this + bounding box is the number of columns and rows that the table or picture spans. + """ + + @override + def __init__( + self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + ) -> None: + """Initialize the MsExcelDocumentBackend object. + + Parameters: + in_doc: The input document object. + path_or_stream: The path or stream to the Excel file. + + Raises: + RuntimeError: An error occurred parsing the file. + """ super().__init__(in_doc, path_or_stream) # Initialise the parents for the hierarchy self.max_levels = 10 - self.parents: Dict[int, Any] = {} + self.parents: dict[int, Any] = {} for i in range(-1, self.max_levels): self.parents[i] = None @@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): elif isinstance(self.path_or_stream, Path): self.workbook = load_workbook(filename=str(self.path_or_stream)) - self.valid = True + self.valid = self.workbook is not None except Exception as e: self.valid = False raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsExcelDocumentBackend could not load document with hash {self.document_hash}" ) from e + @override def is_valid(self) -> bool: - _log.info(f"valid: {self.valid}") + _log.debug(f"valid: {self.valid}") return self.valid @classmethod + @override def supports_pagination(cls) -> bool: return True - def unload(self): - if isinstance(self.path_or_stream, BytesIO): - self.path_or_stream.close() - - self.path_or_stream = None + @override + def page_count(self) -> int: + if self.is_valid() and self.workbook: + return len(self.workbook.sheetnames) + else: + return 0 @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.XLSX} + @override def convert(self) -> DoclingDocument: - # Parses the XLSX into a structured document model. + """Parse the Excel workbook into a DoclingDocument object. + Raises: + RuntimeError: Unable to run the conversion since the backend object failed to + initialize. + + Returns: + The DoclingDocument object representing the Excel workbook. + """ origin = DocumentOrigin( filename=self.file.name or "file.xlsx", mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return doc def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument: + """Parse the Excel workbook and attach its structure to a DoclingDocument. + + Args: + doc: A DoclingDocument object. + + Returns: + A DoclingDocument object with the parsed items. + """ if self.workbook is not None: @@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): for sheet_name in self.workbook.sheetnames: _log.info(f"Processing sheet: {sheet_name}") - # Access the sheet by name sheet = self.workbook[sheet_name] + page_no = self.workbook.index(sheet) + 1 + # do not rely on sheet.max_column, sheet.max_row if there are images + page = doc.add_page(page_no=page_no, size=Size(width=0, height=0)) self.parents[0] = doc.add_group( parent=None, label=GroupLabel.SECTION, name=f"sheet: {sheet_name}", ) - doc = self._convert_sheet(doc, sheet) + width, height = self._find_page_size(doc, page_no) + page.size = Size(width=width, height=height) else: _log.error("Workbook is not initialized.") return doc - def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet): + def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument: + """Parse an Excel worksheet and attach its structure to a DoclingDocument + + Args: + doc: The DoclingDocument to be updated. + sheet: The Excel worksheet to be parsed. + + Returns: + The updated DoclingDocument. + """ doc = self._find_tables_in_sheet(doc, sheet) @@ -140,47 +224,81 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return doc - def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet): + def _find_tables_in_sheet( + self, doc: DoclingDocument, sheet: Worksheet + ) -> DoclingDocument: + """Find all tables in an Excel sheet and attach them to a DoclingDocument. - tables = self._find_data_tables(sheet) + Args: + doc: The DoclingDocument to be updated. + sheet: The Excel worksheet to be parsed. - for excel_table in tables: - num_rows = excel_table.num_rows - num_cols = excel_table.num_cols + Returns: + The updated DoclingDocument. + """ - table_data = TableData( - num_rows=num_rows, - num_cols=num_cols, - table_cells=[], - ) + if self.workbook is not None: + tables = self._find_data_tables(sheet) - for excel_cell in excel_table.data: + for excel_table in tables: + origin_col = excel_table.anchor[0] + origin_row = excel_table.anchor[1] + num_rows = excel_table.num_rows + num_cols = excel_table.num_cols - cell = TableCell( - text=excel_cell.text, - row_span=excel_cell.row_span, - col_span=excel_cell.col_span, - start_row_offset_idx=excel_cell.row, - end_row_offset_idx=excel_cell.row + excel_cell.row_span, - start_col_offset_idx=excel_cell.col, - end_col_offset_idx=excel_cell.col + excel_cell.col_span, - column_header=excel_cell.row == 0, - row_header=False, + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], ) - table_data.table_cells.append(cell) - doc.add_table(data=table_data, parent=self.parents[0]) + for excel_cell in excel_table.data: + + cell = TableCell( + text=excel_cell.text, + row_span=excel_cell.row_span, + col_span=excel_cell.col_span, + start_row_offset_idx=excel_cell.row, + end_row_offset_idx=excel_cell.row + excel_cell.row_span, + start_col_offset_idx=excel_cell.col, + end_col_offset_idx=excel_cell.col + excel_cell.col_span, + column_header=excel_cell.row == 0, + row_header=False, + ) + table_data.table_cells.append(cell) + + page_no = self.workbook.index(sheet) + 1 + doc.add_table( + data=table_data, + parent=self.parents[0], + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + ( + origin_col, + origin_row, + origin_col + num_cols, + origin_row + num_rows, + ), + origin=CoordOrigin.TOPLEFT, + ), + ), + ) return doc - def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]: - """ - Find all compact rectangular data tables in a sheet. - """ - # _log.info("find_data_tables") + def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]: + """Find all compact rectangular data tables in an Excel worksheet. - tables = [] # List to store found tables - visited: set[Tuple[int, int]] = set() # Track already visited cells + Args: + sheet: The Excel worksheet to be parsed. + + Returns: + A list of ExcelTable objects representing the data tables. + """ + tables: list[ExcelTable] = [] # List to store found tables + visited: set[tuple[int, int]] = set() # Track already visited cells # Iterate over all cells in the sheet for ri, row in enumerate(sheet.iter_rows(values_only=False)): @@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): continue # If the cell starts a new table, find its bounds - table_bounds, visited_cells = self._find_table_bounds( - sheet, ri, rj, visited - ) + table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj) visited.update(visited_cells) # Mark these cells as visited tables.append(table_bounds) @@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): sheet: Worksheet, start_row: int, start_col: int, - visited: set[Tuple[int, int]], - ): - """ - Determine the bounds of a compact rectangular table. + ) -> tuple[ExcelTable, set[tuple[int, int]]]: + """Determine the bounds of a compact rectangular table. + + Args: + sheet: The Excel worksheet to be parsed. + start_row: The row number of the starting cell. + start_col: The column number of the starting cell. + Returns: - - A dictionary with the bounds and data. - - A set of visited cell coordinates. + A tuple with an Excel table and a set of cell coordinates. """ - _log.info("find_table_bounds") + _log.debug("find_table_bounds") max_row = self._find_table_bottom(sheet, start_row, start_col) max_col = self._find_table_right(sheet, start_row, start_col) # Collect the data within the bounds data = [] - visited_cells = set() + visited_cells: set[tuple[int, int]] = set() for ri in range(start_row, max_row + 1): for rj in range(start_col, max_col + 1): @@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): row_span = 1 col_span = 1 - # _log.info(sheet.merged_cells.ranges) for merged_range in sheet.merged_cells.ranges: if ( @@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): col_span=col_span, ) ) - # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}") # Mark all cells in the span as visited for span_row in range(ri, ri + row_span): @@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return ( ExcelTable( + anchor=(start_col, start_row), num_rows=max_row + 1 - start_row, num_cols=max_col + 1 - start_col, data=data, @@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): visited_cells, ) - def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int): - """Function to find the bottom boundary of the table""" + def _find_table_bottom( + self, sheet: Worksheet, start_row: int, start_col: int + ) -> int: + """Find the bottom boundary of a table. - max_row = start_row + Args: + sheet: The Excel worksheet to be parsed. + start_row: The starting row of the table. + start_col: The starting column of the table. + + Returns: + The row index representing the bottom boundary of the table. + """ + max_row: int = start_row while max_row < sheet.max_row - 1: # Get the cell value or check if it is part of a merged cell @@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return max_row - def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int): - """Function to find the right boundary of the table""" + def _find_table_right( + self, sheet: Worksheet, start_row: int, start_col: int + ) -> int: + """Find the right boundary of a table. - max_col = start_col + Args: + sheet: The Excel worksheet to be parsed. + start_row: The starting row of the table. + start_col: The starting column of the table. + + Returns: + The column index representing the right boundary of the table." + """ + max_col: int = start_col while max_col < sheet.max_column - 1: # Get the cell value or check if it is part of a merged cell @@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): def _find_images_in_sheet( self, doc: DoclingDocument, sheet: Worksheet ) -> DoclingDocument: + """Find images in the Excel sheet and attach them to the DoclingDocument. - # Iterate over byte images in the sheet - for idx, image in enumerate(sheet._images): # type: ignore + Args: + doc: The DoclingDocument to be updated. + sheet: The Excel worksheet to be parsed. - try: - pil_image = PILImage.open(image.ref) - - doc.add_picture( - parent=self.parents[0], - image=ImageRef.from_pil(image=pil_image, dpi=72), - caption=None, - ) - except: - _log.error("could not extract the image from excel sheets") + Returns: + The updated DoclingDocument. + """ + if self.workbook is not None: + # Iterate over byte images in the sheet + for item in sheet._images: # type: ignore[attr-defined] + try: + image: Image = cast(Image, item) + pil_image = PILImage.open(image.ref) # type: ignore[arg-type] + page_no = self.workbook.index(sheet) + 1 + anchor = (0, 0, 0, 0) + if isinstance(image.anchor, TwoCellAnchor): + anchor = ( + image.anchor._from.col, + image.anchor._from.row, + image.anchor.to.col + 1, + image.anchor.to.row + 1, + ) + doc.add_picture( + parent=self.parents[0], + image=ImageRef.from_pil(image=pil_image, dpi=72), + caption=None, + prov=ProvenanceItem( + page_no=page_no, + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + anchor, origin=CoordOrigin.TOPLEFT + ), + ), + ) + except: + _log.error("could not extract the image from excel sheets") return doc + + @staticmethod + def _find_page_size( + doc: DoclingDocument, page_no: PositiveInt + ) -> tuple[float, float]: + left: float = -1.0 + top: float = -1.0 + right: float = -1.0 + bottom: float = -1.0 + for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no): + if not isinstance(item, DocItem): + continue + for provenance in item.prov: + bbox = provenance.bbox + left = min(left, bbox.l) if left != -1 else bbox.l + right = max(right, bbox.r) if right != -1 else bbox.r + top = min(top, bbox.t) if top != -1 else bbox.t + bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b + + return (right - left, bottom - top) diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index 173cd5fb..2bdfe509 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -97,7 +97,22 @@ "children": [], "content_layer": "body", "label": "picture", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 8.0, + "t": 18.0, + "r": 13.0, + "b": 36.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -122,7 +137,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 3.0, + "b": 7.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -661,7 +691,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 4.0, + "b": 9.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -1564,7 +1609,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 6.0, + "t": 4.0, + "r": 9.0, + "b": 9.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -1955,7 +2015,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 2.0, + "t": 13.0, + "r": 5.0, + "b": 18.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -2346,7 +2421,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 3.0, + "b": 7.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -2813,7 +2903,22 @@ "children": [], "content_layer": "body", "label": "table", - "prov": [], + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 4.0, + "t": 6.0, + "r": 7.0, + "b": 13.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -3275,5 +3380,27 @@ ], "key_value_items": [], "form_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 3.0, + "height": 7.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 9.0, + "height": 18.0 + }, + "page_no": 2 + }, + "3": { + "size": { + "width": 13.0, + "height": 36.0 + }, + "page_no": 3 + } + } } \ No newline at end of file diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 1844dff1..0604429c 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -1,13 +1,18 @@ -import os +import logging from pathlib import Path +import pytest + +from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import ConversionResult, DoclingDocument +from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument from docling.document_converter import DocumentConverter from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export +_log = logging.getLogger(__name__) + GENERATE = GEN_TEST_DATA @@ -28,13 +33,15 @@ def get_converter(): return converter -def test_e2e_xlsx_conversions(): +@pytest.fixture(scope="module") +def documents() -> list[tuple[Path, DoclingDocument]]: + documents: list[dict[Path, DoclingDocument]] = [] xlsx_paths = get_xlsx_paths() converter = get_converter() for xlsx_path in xlsx_paths: - print(f"converting {xlsx_path}") + _log.debug(f"converting {xlsx_path}") gt_path = ( xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name @@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions(): doc: DoclingDocument = conv_result.document + assert doc, f"Failed to convert document from file {gt_path}" + documents.append((gt_path, doc)) + + return documents + + +def test_e2e_xlsx_conversions(documents) -> None: + for gt_path, doc in documents: pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" @@ -57,3 +72,30 @@ def test_e2e_xlsx_conversions(): assert verify_document( doc, str(gt_path) + ".json", GENERATE ), "document document" + + +def test_pages(documents) -> None: + """Test the page count and page size of converted documents. + + Args: + documents: The paths and converted documents. + """ + # number of pages from the backend method + path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0] + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.XLSX, + filename=path.stem, + backend=MsExcelDocumentBackend, + ) + backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) + assert backend.page_count() == 3 + + # number of pages from the converted document + doc = [item for path, item in documents if path.stem == "test-01"][0] + assert len(doc.pages) == 3 + + # page sizes as number of cells + assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) + assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) + assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) From 250399948de69fe01cd789e328194f38a03598a7 Mon Sep 17 00:00:00 2001 From: Rowan Skewes Date: Fri, 11 Apr 2025 19:14:05 +1000 Subject: [PATCH 5/6] fix: Implement PictureDescriptionApiOptions.bitmap_area_threshold (#1248) fix: Implement PictureDescriptionApiOptions.picture_area_threshold Signed-off-by: Rowan Skewes --- docling/datamodel/pipeline_options.py | 4 ++-- docling/models/picture_description_base_model.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 9791a251..8e99cd09 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions): batch_size: int = 8 scale: float = 2 - bitmap_area_threshold: float = ( - 0.2 # percentage of the area for a bitmap to processed with the models + picture_area_threshold: float = ( + 0.05 # percentage of the area for a picture to processed with the models ) diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py index 129387b3..96169227 100644 --- a/docling/models/picture_description_base_model.py +++ b/docling/models/picture_description_base_model.py @@ -63,8 +63,20 @@ class PictureDescriptionBaseModel( elements: List[PictureItem] = [] for el in element_batch: assert isinstance(el.item, PictureItem) - elements.append(el.item) - images.append(el.image) + describe_image = True + # Don't describe the image if it's smaller than the threshold + if len(el.item.prov) > 0: + prov = el.item.prov[0] # PictureItems have at most a single provenance + page = doc.pages.get(prov.page_no) + if page is not None: + page_area = page.size.width * page.size.height + if page_area > 0: + area_fraction = prov.bbox.area() / page_area + if area_fraction < self.options.picture_area_threshold: + describe_image = False + if describe_image: + elements.append(el.item) + images.append(el.image) outputs = self._annotate_images(images) From 415b877984fd89884e97b4740bc553e800055e0e Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:04:00 +0200 Subject: [PATCH 6/6] fix(docx): declare image_data variable when handling pictures (#1359) Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msword_backend.py | 32 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5530bba0..5915c0a5 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _handle_pictures( self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument ) -> None: - def get_docx_image(drawing_blip): + def get_docx_image(drawing_blip: Any) -> Optional[bytes]: + image_data: Optional[bytes] = None rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) @@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self._get_level() # Open the BytesIO object with PIL to create an Image - try: - image_data = get_docx_image(drawing_blip) - image_bytes = BytesIO(image_data) - pil_image = Image.open(image_bytes) - doc.add_picture( - parent=self.parents[level - 1], - image=ImageRef.from_pil(image=pil_image, dpi=72), - caption=None, - ) - except (UnidentifiedImageError, OSError) as e: - _log.warning("Warning: image cannot be loaded by Pillow") + image_data: Optional[bytes] = get_docx_image(drawing_blip) + if image_data is None: + _log.warning("Warning: image cannot be found") doc.add_picture( parent=self.parents[level - 1], caption=None, ) + else: + try: + image_bytes = BytesIO(image_data) + pil_image = Image.open(image_bytes) + doc.add_picture( + parent=self.parents[level - 1], + image=ImageRef.from_pil(image=pil_image, dpi=72), + caption=None, + ) + except (UnidentifiedImageError, OSError) as e: + _log.warning("Warning: image cannot be loaded by Pillow") + doc.add_picture( + parent=self.parents[level - 1], + caption=None, + ) return