diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..85260753 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +tests/data/** linguist-vendored +tests/data_scanned/** linguist-vendored +docs/** linguist-vendored diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 362380d5..f18fc88c 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -60,7 +60,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then echo "Skipping $file" continue fi diff --git a/CHANGELOG.md b/CHANGELOG.md index 9678243c..040a3b55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,18 @@ +## [v2.35.0](https://github.com/docling-project/docling/releases/tag/v2.35.0) - 2025-06-02 + +### Feature + +* Add visualization of bbox on page with html export. ([#1663](https://github.com/docling-project/docling/issues/1663)) ([`b356b33`](https://github.com/docling-project/docling/commit/b356b33059bdeeaf1584d9d189cbf1c4832e367c)) + +### Fix + +* Guess HTML content starting with script tag ([#1673](https://github.com/docling-project/docling/issues/1673)) ([`984cb13`](https://github.com/docling-project/docling/commit/984cb137f6a8ae2f3a63623add6c474d97ef8739)) +* UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte ([#1665](https://github.com/docling-project/docling/issues/1665)) ([`51d3450`](https://github.com/docling-project/docling/commit/51d34509156e2dbec9e697276681d59f9ca7e020)) + +### Documentation + +* Fix typo in index.md ([#1676](https://github.com/docling-project/docling/issues/1676)) ([`11ca4f7`](https://github.com/docling-project/docling/commit/11ca4f7a7bd8068bee472510dd71f1cd58f86f17)) + ## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22 ### Feature diff --git a/README.md b/README.md index 3911f794..309e1030 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. 
LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images -* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕 +* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) * 💻 Simple and convenient CLI ### Coming soon diff --git a/docling/cli/main.py b/docling/cli/main.py index 98a4c8d7..083f53b2 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type import rich.table import typer +from docling_core.transforms.serializer.html import ( + HTMLDocSerializer, + HTMLOutputStyle, + HTMLParams, +) +from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer from docling_core.types.doc import ImageRefMode from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter @@ -22,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import ( ConversionStatus, FormatToExtensions, @@ -30,8 +37,6 @@ from docling.datamodel.base_models import ( ) from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, EasyOcrOptions, OcrOptions, PaginatedPipelineOptions, @@ -39,14 +44,16 @@ from docling.datamodel.pipeline_options import ( PdfPipeline, PdfPipelineOptions, TableFormerMode, - VlmModelType, VlmPipelineOptions, - granite_vision_vlm_conversion_options, - granite_vision_vlm_ollama_conversion_options, - smoldocling_vlm_conversion_options, - smoldocling_vlm_mlx_conversion_options, ) from docling.datamodel.settings import settings +from docling.datamodel.vlm_model_specs import ( + GRANITE_VISION_OLLAMA, + GRANITE_VISION_TRANSFORMERS, + SMOLDOCLING_MLX, + SMOLDOCLING_TRANSFORMERS, + VlmModelType, +) from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.models.factories import get_ocr_factory from docling.pipeline.vlm_pipeline import VlmPipeline @@ -156,6 +163,7 @@ def export_documents( export_json: bool, export_html: bool, export_html_split_page: bool, + show_layout: bool, export_md: bool, export_txt: bool, export_doctags: bool, @@ -189,9 +197,27 @@ def export_documents( if export_html_split_page: fname = output_dir / f"{doc_filename}.html" _log.info(f"writing HTML output to {fname}") - conv_res.document.save_as_html( - filename=fname, image_mode=image_export_mode, split_page_view=True - ) + if show_layout: + ser = HTMLDocSerializer( + doc=conv_res.document, + params=HTMLParams( + image_mode=image_export_mode, + output_style=HTMLOutputStyle.SPLIT_PAGE, + ), + ) + visualizer = LayoutVisualizer() + visualizer.params.show_label = False + ser_res = ser.serialize( + visualizer=visualizer, + ) + with open(fname, "w") as fw: + fw.write(ser_res.text) + else: + conv_res.document.save_as_html( + filename=fname, + image_mode=image_export_mode, + split_page_view=True, + ) # Export Text format: if export_txt: @@ -250,6 +276,13 @@ def convert( # noqa: C901 to_formats: List[OutputFormat] = typer.Option( None, "--to", help="Specify 
output formats. Defaults to Markdown." ), + show_layout: Annotated[ + bool, + typer.Option( + ..., + help="If enabled, the page images will show the bounding-boxes of the items.", + ), + ] = False, headers: str = typer.Option( None, "--headers", @@ -547,20 +580,16 @@ def convert( # noqa: C901 ) if vlm_model == VlmModelType.GRANITE_VISION: - pipeline_options.vlm_options = granite_vision_vlm_conversion_options + pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: - pipeline_options.vlm_options = ( - granite_vision_vlm_ollama_conversion_options - ) + pipeline_options.vlm_options = GRANITE_VISION_OLLAMA elif vlm_model == VlmModelType.SMOLDOCLING: - pipeline_options.vlm_options = smoldocling_vlm_conversion_options + pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS if sys.platform == "darwin": try: import mlx_vlm - pipeline_options.vlm_options = ( - smoldocling_vlm_mlx_conversion_options - ) + pipeline_options.vlm_options = SMOLDOCLING_MLX except ImportError: _log.warning( "To run SmolDocling faster, please install mlx-vlm:\n" @@ -596,6 +625,7 @@ def convert( # noqa: C901 export_json=export_json, export_html=export_html, export_html_split_page=export_html_split_page, + show_layout=show_layout, export_md=export_md, export_txt=export_txt, export_doctags=export_doctags, diff --git a/docling/datamodel/accelerator_options.py b/docling/datamodel/accelerator_options.py new file mode 100644 index 00000000..1b0ea8cf --- /dev/null +++ b/docling/datamodel/accelerator_options.py @@ -0,0 +1,68 @@ +import logging +import os +import re +from enum import Enum +from typing import Any, Union + +from pydantic import field_validator, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + +_log = logging.getLogger(__name__) + + +class AcceleratorDevice(str, Enum): + """Devices to run model inference""" + + AUTO = "auto" + CPU = "cpu" + CUDA = "cuda" + MPS = "mps" + + +class AcceleratorOptions(BaseSettings): + model_config = SettingsConfigDict( + env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True + ) + + num_threads: int = 4 + device: Union[str, AcceleratorDevice] = "auto" + cuda_use_flash_attention2: bool = False + + @field_validator("device") + def validate_device(cls, value): + # "auto", "cpu", "cuda", "mps", or "cuda:N" + if value in {d.value for d in AcceleratorDevice} or re.match( + r"^cuda(:\d+)?$", value + ): + return value + raise ValueError( + "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'." + ) + + @model_validator(mode="before") + @classmethod + def check_alternative_envvars(cls, data: Any) -> Any: + r""" + Set num_threads from the "alternative" envvar OMP_NUM_THREADS. + The alternative envvar is used only if it is valid and the regular envvar is not set. + + Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide + the same functionality. In case the alias envvar is set and the user tries to override the + parameter in settings initialization, Pydantic treats the parameter provided in __init__() + as an extra input instead of simply overwriting the envvar value for that parameter. 
+ """ + if isinstance(data, dict): + input_num_threads = data.get("num_threads") + # Check if to set the num_threads from the alternative envvar + if input_num_threads is None: + docling_num_threads = os.getenv("DOCLING_NUM_THREADS") + omp_num_threads = os.getenv("OMP_NUM_THREADS") + if docling_num_threads is None and omp_num_threads is not None: + try: + data["num_threads"] = int(omp_num_threads) + except ValueError: + _log.error( + "Ignoring misformatted envvar OMP_NUM_THREADS '%s'", + omp_num_threads, + ) + return data diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 1e98a972..6299e619 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -13,11 +13,11 @@ from docling_core.types.doc import ( TableCell, ) from docling_core.types.doc.page import SegmentedPdfPage, TextCell - -# DO NOT REMOVE; explicitly exposed from this location from docling_core.types.io import ( DocumentStream, ) + +# DO NOT REMOVE; explicitly exposed from this location from PIL.Image import Image from pydantic import BaseModel, ConfigDict, Field, computed_field @@ -131,12 +131,6 @@ class ErrorItem(BaseModel): error_message: str -# class Cell(BaseModel): -# id: int -# text: str -# bbox: BoundingBox - - class Cluster(BaseModel): id: int label: DocItemLabel @@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel): clusters: List[Cluster] = [] +class VlmPredictionToken(BaseModel): + text: str = "" + token: int = -1 + logprob: float = -1 + + class VlmPrediction(BaseModel): text: str = "" + generated_tokens: list[VlmPredictionToken] = [] + generation_time: float = -1 class ContainerElement( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 5791c0e4..4c71f5c8 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel): else: return "application/xml" - if re.match(r".*?\s*)?( Any: - r""" - Set num_threads from the "alternative" envvar OMP_NUM_THREADS. - The alternative envvar is used only if it is valid and the regular envvar is not set. - - Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide - the same functionality. In case the alias envvar is set and the user tries to override the - parameter in settings initialization, Pydantic treats the parameter provided in __init__() - as an extra input instead of simply overwriting the evvar value for that parameter. - """ - if isinstance(data, dict): - input_num_threads = data.get("num_threads") - # Check if to set the num_threads from the alternative envvar - if input_num_threads is None: - docling_num_threads = os.getenv("DOCLING_NUM_THREADS") - omp_num_threads = os.getenv("OMP_NUM_THREADS") - if docling_num_threads is None and omp_num_threads is not None: - try: - data["num_threads"] = int(omp_num_threads) - except ValueError: - _log.error( - "Ignoring misformatted envvar OMP_NUM_THREADS '%s'", - omp_num_threads, - ) - return data - - class BaseOptions(BaseModel): """Base class for options.""" @@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions): lang: List[str] = [ "english", "chinese", - ] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything. 
- # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/ + ] + # However, language as a parameter is not supported by rapidocr yet + # and hence changing this option doesn't affect anything. + + # For more details on supported languages by RapidOCR visit + # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/ + + # For more details on the following options visit + # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/ - # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/ text_score: float = 0.5 # same default as rapidocr use_det: Optional[bool] = None # same default as rapidocr use_cls: Optional[bool] = None # same default as rapidocr use_rec: Optional[bool] = None # same default as rapidocr - # class Device(Enum): - # CPU = "CPU" - # CUDA = "CUDA" - # DIRECTML = "DIRECTML" - # AUTO = "AUTO" - - # device: Device = Device.AUTO # Default value is AUTO print_verbose: bool = False # same default as rapidocr det_model_path: Optional[str] = None # same default as rapidocr @@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): return self.repo_id.replace("/", "--") +# SmolVLM smolvlm_picture_description = PictureDescriptionVlmOptions( repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" ) -# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") + +# GraniteVision granite_picture_description = PictureDescriptionVlmOptions( repo_id="ibm-granite/granite-vision-3.1-2b-preview", prompt="What is shown in this image?", ) -class BaseVlmOptions(BaseModel): - kind: str - prompt: str - - -class ResponseFormat(str, Enum): - DOCTAGS = "doctags" - MARKDOWN = "markdown" - - -class InferenceFramework(str, Enum): - MLX = "mlx" - TRANSFORMERS = "transformers" - OPENAI = "openai" - - -class HuggingFaceVlmOptions(BaseVlmOptions): - kind: Literal["hf_model_options"] = "hf_model_options" - - repo_id: str - load_in_8bit: bool = True - llm_int8_threshold: float = 6.0 - quantized: bool = False - - inference_framework: InferenceFramework - response_format: ResponseFormat - - @property - def repo_cache_folder(self) -> str: - return self.repo_id.replace("/", "--") - - -class ApiVlmOptions(BaseVlmOptions): - kind: Literal["api_model_options"] = "api_model_options" - - url: AnyUrl = AnyUrl( - "http://localhost:11434/v1/chat/completions" - ) # Default to ollama - headers: Dict[str, str] = {} - params: Dict[str, Any] = {} - scale: float = 2.0 - timeout: float = 60 - concurrency: int = 1 - response_format: ResponseFormat - - -smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions( - repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", - prompt="Convert this page to docling.", - response_format=ResponseFormat.DOCTAGS, - inference_framework=InferenceFramework.MLX, -) - - -smoldocling_vlm_conversion_options = HuggingFaceVlmOptions( - repo_id="ds4sd/SmolDocling-256M-preview", - prompt="Convert this page to docling.", - response_format=ResponseFormat.DOCTAGS, - inference_framework=InferenceFramework.TRANSFORMERS, -) - -granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( - repo_id="ibm-granite/granite-vision-3.1-2b-preview", - # prompt="OCR the full page to markdown.", - prompt="OCR this image.", - response_format=ResponseFormat.MARKDOWN, 
inference_framework=InferenceFramework.TRANSFORMERS, -) - -granite_vision_vlm_ollama_conversion_options = ApiVlmOptions( - url=AnyUrl("http://localhost:11434/v1/chat/completions"), - params={"model": "granite3.2-vision:2b"}, - prompt="OCR the full page to markdown.", - scale=1.0, - timeout=120, - response_format=ResponseFormat.MARKDOWN, -) - - -class VlmModelType(str, Enum): - SMOLDOCLING = "smoldocling" - GRANITE_VISION = "granite_vision" - GRANITE_VISION_OLLAMA = "granite_vision_ollama" - - # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions): False # (To be used with vlms, or other generative models) ) # If True, text from backend will be used instead of generated text - vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = ( + vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = ( smoldocling_vlm_conversion_options ) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py new file mode 100644 index 00000000..c1ec28aa --- /dev/null +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -0,0 +1,81 @@ +from enum import Enum +from typing import Any, Dict, List, Literal + +from pydantic import AnyUrl, BaseModel +from typing_extensions import deprecated + +from docling.datamodel.accelerator_options import AcceleratorDevice + + +class BaseVlmOptions(BaseModel): + kind: str + prompt: str + + +class ResponseFormat(str, Enum): + DOCTAGS = "doctags" + MARKDOWN = "markdown" + HTML = "html" + + +class InferenceFramework(str, Enum): + MLX = "mlx" + TRANSFORMERS = "transformers" + + +class TransformersModelType(str, Enum): + AUTOMODEL = "automodel" + AUTOMODEL_VISION2SEQ = "automodel-vision2seq" + AUTOMODEL_CAUSALLM = "automodel-causallm" + + +class InlineVlmOptions(BaseVlmOptions): + kind: Literal["inline_model_options"] = "inline_model_options" + + repo_id: str + trust_remote_code: bool = False + load_in_8bit: bool = True + llm_int8_threshold: float = 6.0 + quantized: bool = False + + inference_framework: InferenceFramework + transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL + response_format: ResponseFormat + + supported_devices: List[AcceleratorDevice] = [ + AcceleratorDevice.CPU, + AcceleratorDevice.CUDA, + AcceleratorDevice.MPS, + ] + + scale: float = 2.0 + + temperature: float = 0.0 + stop_strings: List[str] = [] + extra_generation_config: Dict[str, Any] = {} + + use_kv_cache: bool = True + max_new_tokens: int = 4096 + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +@deprecated("Use InlineVlmOptions instead.") +class HuggingFaceVlmOptions(InlineVlmOptions): + pass + + +class ApiVlmOptions(BaseVlmOptions): + kind: Literal["api_model_options"] = "api_model_options" + + url: AnyUrl = AnyUrl( + "http://localhost:11434/v1/chat/completions" + ) # Default to ollama + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + scale: float = 2.0 + timeout: float = 60 + concurrency: int = 1 + response_format: ResponseFormat diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py new file mode 100644 index 00000000..5045c846 --- /dev/null +++ b/docling/datamodel/vlm_model_specs.py @@ -0,0 +1,144 @@ +import logging +from enum import Enum + +from pydantic import ( + AnyUrl, +) + +from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.pipeline_options_vlm_model import ( + 
ApiVlmOptions, + InferenceFramework, + InlineVlmOptions, + ResponseFormat, + TransformersModelType, +) + +_log = logging.getLogger(__name__) + + +# SmolDocling +SMOLDOCLING_MLX = InlineVlmOptions( + repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, + inference_framework=InferenceFramework.MLX, + supported_devices=[AcceleratorDevice.MPS], + scale=2.0, + temperature=0.0, +) + +SMOLDOCLING_TRANSFORMERS = InlineVlmOptions( + repo_id="ds4sd/SmolDocling-256M-preview", + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + supported_devices=[ + AcceleratorDevice.CPU, + AcceleratorDevice.CUDA, + AcceleratorDevice.MPS, + ], + scale=2.0, + temperature=0.0, +) + +# GraniteVision +GRANITE_VISION_TRANSFORMERS = InlineVlmOptions( + repo_id="ibm-granite/granite-vision-3.2-2b", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + supported_devices=[ + AcceleratorDevice.CPU, + AcceleratorDevice.CUDA, + AcceleratorDevice.MPS, + ], + scale=2.0, + temperature=0.0, +) + +GRANITE_VISION_OLLAMA = ApiVlmOptions( + url=AnyUrl("http://localhost:11434/v1/chat/completions"), + params={"model": "granite3.2-vision:2b"}, + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + scale=1.0, + timeout=120, + response_format=ResponseFormat.MARKDOWN, + temperature=0.0, +) + +# Pixtral +PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions( + repo_id="mistral-community/pixtral-12b", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA], + scale=2.0, + temperature=0.0, +) + +PIXTRAL_12B_MLX = InlineVlmOptions( + repo_id="mlx-community/pixtral-12b-bf16", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.MLX, + supported_devices=[AcceleratorDevice.MPS], + scale=2.0, + temperature=0.0, +) + +# Phi4 +PHI4_TRANSFORMERS = InlineVlmOptions( + repo_id="microsoft/Phi-4-multimodal-instruct", + prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown", + trust_remote_code=True, + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM, + supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA], + scale=2.0, + temperature=0.0, + extra_generation_config=dict(num_logits_to_keep=0), +) + +# Qwen +QWEN25_VL_3B_MLX = InlineVlmOptions( + repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16", + prompt="Convert this page to markdown. 
Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.MLX, + supported_devices=[AcceleratorDevice.MPS], + scale=2.0, + temperature=0.0, +) + +# Gemma-3 +GEMMA3_12B_MLX = InlineVlmOptions( + repo_id="mlx-community/gemma-3-12b-it-bf16", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.MLX, + supported_devices=[AcceleratorDevice.MPS], + scale=2.0, + temperature=0.0, +) + +GEMMA3_27B_MLX = InlineVlmOptions( + repo_id="mlx-community/gemma-3-27b-it-bf16", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.MLX, + supported_devices=[AcceleratorDevice.MPS], + scale=2.0, + temperature=0.0, +) + + +class VlmModelType(str, Enum): + SMOLDOCLING = "smoldocling" + GRANITE_VISION = "granite_vision" + GRANITE_VISION_OLLAMA = "granite_vision_ollama" diff --git a/docling/document_converter.py b/docling/document_converter.py index 08095d43..e553c083 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -186,6 +186,11 @@ class DocumentConverter: Tuple[Type[BasePipeline], str], BasePipeline ] = {} + def _get_initialized_pipelines( + self, + ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]: + return self.initialized_pipelines + def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str: """Generate a hash of pipeline options to use as part of the cache key.""" options_str = str(pipeline_options.model_dump()) diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py index 60bc6fce..30bc43ea 100644 --- a/docling/models/api_vlm_model.py +++ b/docling/models/api_vlm_model.py @@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ApiVlmOptions +from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions from docling.exceptions import OperationNotAllowed from docling.models.base_model import BasePageModel from docling.utils.api_image_request import api_image_request diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 9f05aed3..4cbdeba5 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -11,9 +11,10 @@ from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import binary_dilation, find_objects, label +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions +from docling.datamodel.pipeline_options import OcrOptions from docling.datamodel.settings import settings from docling.models.base_model import BaseModelWithOptions, BasePageModel diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py index bf747c56..19a831ab 100644 --- a/docling/models/code_formula_model.py +++ b/docling/models/code_formula_model.py @@ -16,9 +16,10 @@ from docling_core.types.doc.labels import CodeLanguageLabel from PIL import Image, ImageOps from pydantic import BaseModel +from docling.datamodel.accelerator_options import 
AcceleratorOptions from docling.datamodel.base_models import ItemAndImageEnrichmentElement -from docling.datamodel.pipeline_options import AcceleratorOptions from docling.models.base_model import BaseItemAndImageEnrichmentModel +from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device @@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): force: bool = False, progress: bool = False, ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( + return download_hf_model( repo_id="ds4sd/CodeFormula", - force_download=force, - local_dir=local_dir, revision="v1.0.2", + local_dir=local_dir, + force=force, + progress=progress, ) - return Path(download_path) - def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: """ Determines if a given element in a document can be processed by the model. diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 6a57a74d..73a30203 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -13,8 +13,9 @@ from docling_core.types.doc import ( from PIL import Image from pydantic import BaseModel -from docling.datamodel.pipeline_options import AcceleratorOptions +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.models.base_model import BaseEnrichmentModel +from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device @@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel): def download_models( local_dir: Optional[Path] = None, force: bool = False, progress: bool = False ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( + return download_hf_model( repo_id="ds4sd/DocumentFigureClassifier", - force_download=force, - local_dir=local_dir, revision="v1.0.1", + local_dir=local_dir, + force=force, + progress=progress, ) - return Path(download_path) - def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: """ Determines if the given element can be processed by the classifier. 
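Note that the per-model download_models staticmethods in the hunks above now delegate to a shared download_hf_model helper, whose full definition appears later in this diff (docling/models/utils/hf_model_download.py). Below is a minimal usage sketch of that helper, assuming only the signature shown in the new module; the local_dir value is hypothetical:

from pathlib import Path

from docling.models.utils.hf_model_download import download_hf_model

# Prefetch the pinned revision of the figure-classifier weights; the helper
# disables the huggingface_hub progress bars unless progress=True and returns
# the resolved snapshot path as a pathlib.Path.
weights_path: Path = download_hf_model(
    repo_id="ds4sd/DocumentFigureClassifier",
    revision="v1.0.1",  # same revision DocumentPictureClassifier pins above
    local_dir=Path("./models/DocumentFigureClassifier"),  # hypothetical target
    progress=True,
)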
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index b40ca506..53bee9c9 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -9,11 +9,10 @@ import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, EasyOcrOptions, OcrOptions, ) diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py deleted file mode 100644 index 29276fc4..00000000 --- a/docling/models/hf_vlm_model.py +++ /dev/null @@ -1,182 +0,0 @@ -import logging -import time -from collections.abc import Iterable -from pathlib import Path -from typing import Optional - -from docling.datamodel.base_models import Page, VlmPrediction -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( - AcceleratorOptions, - HuggingFaceVlmOptions, -) -from docling.models.base_model import BasePageModel -from docling.utils.accelerator_utils import decide_device -from docling.utils.profiling import TimeRecorder - -_log = logging.getLogger(__name__) - - -class HuggingFaceVlmModel(BasePageModel): - def __init__( - self, - enabled: bool, - artifacts_path: Optional[Path], - accelerator_options: AcceleratorOptions, - vlm_options: HuggingFaceVlmOptions, - ): - self.enabled = enabled - - self.vlm_options = vlm_options - - if self.enabled: - import torch - from transformers import ( # type: ignore - AutoModelForVision2Seq, - AutoProcessor, - BitsAndBytesConfig, - ) - - device = decide_device(accelerator_options.device) - self.device = device - - _log.debug(f"Available device for HuggingFace VLM: {device}") - - repo_cache_folder = vlm_options.repo_id.replace("/", "--") - - # PARAMETERS: - if artifacts_path is None: - artifacts_path = self.download_models(self.vlm_options.repo_id) - elif (artifacts_path / repo_cache_folder).exists(): - artifacts_path = artifacts_path / repo_cache_folder - - self.param_question = vlm_options.prompt # "Perform Layout Analysis." 
- self.param_quantization_config = BitsAndBytesConfig( - load_in_8bit=vlm_options.load_in_8bit, # True, - llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0 - ) - self.param_quantized = vlm_options.quantized # False - - self.processor = AutoProcessor.from_pretrained(artifacts_path) - if not self.param_quantized: - self.vlm_model = AutoModelForVision2Seq.from_pretrained( - artifacts_path, - device_map=device, - torch_dtype=torch.bfloat16, - _attn_implementation=( - "flash_attention_2" - if self.device.startswith("cuda") - and accelerator_options.cuda_use_flash_attention2 - else "eager" - ), - ) # .to(self.device) - - else: - self.vlm_model = AutoModelForVision2Seq.from_pretrained( - artifacts_path, - device_map=device, - torch_dtype="auto", - quantization_config=self.param_quantization_config, - _attn_implementation=( - "flash_attention_2" - if self.device.startswith("cuda") - and accelerator_options.cuda_use_flash_attention2 - else "eager" - ), - ) # .to(self.device) - - @staticmethod - def download_models( - repo_id: str, - local_dir: Optional[Path] = None, - force: bool = False, - progress: bool = False, - ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( - repo_id=repo_id, - force_download=force, - local_dir=local_dir, - # revision="v0.0.1", - ) - - return Path(download_path) - - def __call__( - self, conv_res: ConversionResult, page_batch: Iterable[Page] - ) -> Iterable[Page]: - for page in page_batch: - assert page._backend is not None - if not page._backend.is_valid(): - yield page - else: - with TimeRecorder(conv_res, "vlm"): - assert page.size is not None - - hi_res_image = page.get_image(scale=2.0) # 144dpi - # hi_res_image = page.get_image(scale=1.0) # 72dpi - - if hi_res_image is not None: - im_width, im_height = hi_res_image.size - - # populate page_tags with predicted doc tags - page_tags = "" - - if hi_res_image: - if hi_res_image.mode != "RGB": - hi_res_image = hi_res_image.convert("RGB") - - messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "This is a page from a document.", - }, - {"type": "image"}, - {"type": "text", "text": self.param_question}, - ], - } - ] - prompt = self.processor.apply_chat_template( - messages, add_generation_prompt=False - ) - inputs = self.processor( - text=prompt, images=[hi_res_image], return_tensors="pt" - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - start_time = time.time() - # Call model to generate: - generated_ids = self.vlm_model.generate( - **inputs, max_new_tokens=4096, use_cache=True - ) - - generation_time = time.time() - start_time - generated_texts = self.processor.batch_decode( - generated_ids[:, inputs["input_ids"].shape[1] :], - skip_special_tokens=False, - )[0] - - num_tokens = len(generated_ids[0]) - page_tags = generated_texts - - _log.debug( - f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." 
- ) - - # inference_time = time.time() - start_time - # tokens_per_second = num_tokens / generation_time - # print("") - # print(f"Page Inference Time: {inference_time:.2f} seconds") - # print(f"Total tokens on page: {num_tokens:.2f}") - # print(f"Tokens/sec: {tokens_per_second:.2f}") - # print("") - page.predictions.vlm_response = VlmPrediction(text=page_tags) - - yield page diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index e2abb373..d8e9c032 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -10,11 +10,12 @@ from docling_core.types.doc import DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import Image +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel +from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device from docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.profiling import TimeRecorder @@ -83,20 +84,14 @@ class LayoutModel(BasePageModel): force: bool = False, progress: bool = False, ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( + return download_hf_model( repo_id="ds4sd/docling-models", - force_download=force, + revision="v2.2.0", local_dir=local_dir, - revision="v2.1.0", + force=force, + progress=progress, ) - return Path(download_path) - def draw_clusters_and_cells_side_by_side( self, conv_res, page, clusters, mode_prefix: str, show: bool = False ): @@ -185,13 +180,23 @@ class LayoutModel(BasePageModel): ).postprocess() # processed_clusters, processed_cells = clusters, page.cells - conv_res.confidence.pages[page.page_no].layout_score = float( - np.mean([c.confidence for c in processed_clusters]) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Mean of empty slice|invalid value encountered in scalar divide", + RuntimeWarning, + "numpy", + ) - conv_res.confidence.pages[page.page_no].ocr_score = float( - np.mean([c.confidence for c in processed_cells if c.from_ocr]) - ) + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean( + [c.confidence for c in processed_cells if c.from_ocr] + ) + ) page.cells = processed_cells page.predictions.layout = LayoutPrediction( diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index a8ff55b8..a410a7f6 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -8,10 +8,10 @@ from typing import Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorOptions, OcrMacOptions, OcrOptions, ) diff --git 
a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 6a1dcf19..3cfa6352 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,4 +1,5 @@ import re +import warnings from collections.abc import Iterable from pathlib import Path from typing import Optional @@ -7,7 +8,7 @@ import numpy as np from PIL import ImageDraw from pydantic import BaseModel -from docling.datamodel.base_models import Page, ScoreValue +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel @@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel): score = self.rate_text_quality(c.text) text_scores.append(score) - conv_res.confidence.pages[page.page_no].parse_score = float( - np.nanquantile( - text_scores, q=0.10 - ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Mean of empty slice", RuntimeWarning, "numpy" + ) + conv_res.confidence.pages[page.page_no].parse_score = float( + np.nanquantile( + text_scores, q=0.10 + ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. + ) # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py index eb331b29..a3c0c2ee 100644 --- a/docling/models/picture_description_api_model.py +++ b/docling/models/picture_description_api_model.py @@ -5,8 +5,8 @@ from typing import Optional, Type, Union from PIL import Image +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.pipeline_options import ( - AcceleratorOptions, PictureDescriptionApiOptions, PictureDescriptionBaseOptions, ) diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py index 2f6e6479..055c74b1 100644 --- a/docling/models/picture_description_base_model.py +++ b/docling/models/picture_description_base_model.py @@ -13,8 +13,8 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co ) from PIL import Image +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.pipeline_options import ( - AcceleratorOptions, PictureDescriptionBaseOptions, ) from docling.models.base_model import ( diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py index 679e80c2..230151d6 100644 --- a/docling/models/picture_description_vlm_model.py +++ b/docling/models/picture_description_vlm_model.py @@ -4,16 +4,21 @@ from typing import Optional, Type, Union from PIL import Image +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.pipeline_options import ( - AcceleratorOptions, PictureDescriptionBaseOptions, PictureDescriptionVlmOptions, ) from docling.models.picture_description_base_model import PictureDescriptionBaseModel +from docling.models.utils.hf_model_download import ( + HuggingFaceModelDownloadMixin, +) from docling.utils.accelerator_utils import decide_device -class PictureDescriptionVlmModel(PictureDescriptionBaseModel): +class PictureDescriptionVlmModel( + PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin +): @classmethod def get_options_type(cls) -> 
Type[PictureDescriptionBaseOptions]: return PictureDescriptionVlmOptions @@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel): self.provenance = f"{self.options.repo_id}" - @staticmethod - def download_models( - repo_id: str, - local_dir: Optional[Path] = None, - force: bool = False, - progress: bool = False, - ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( - repo_id=repo_id, - force_download=force, - local_dir=local_dir, - ) - - return Path(download_path) - def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: from transformers import GenerationConfig diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py index 2c7f4357..b01c5acf 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docling/models/rapid_ocr_model.py @@ -7,11 +7,10 @@ import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, OcrOptions, RapidOcrOptions, ) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index a7679eab..b90e85d5 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -13,16 +13,16 @@ from docling_core.types.doc.page import ( from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, TableFormerMode, TableStructureOptions, ) from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel +from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import TimeRecorder @@ -90,20 +90,14 @@ class TableStructureModel(BasePageModel): def download_models( local_dir: Optional[Path] = None, force: bool = False, progress: bool = False ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( + return download_hf_model( repo_id="ds4sd/docling-models", - force_download=force, - local_dir=local_dir, revision="v2.2.0", + local_dir=local_dir, + force=force, + progress=progress, ) - return Path(download_path) - def draw_table_and_cells( self, conv_res: ConversionResult, diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 8bca5479..e6901045 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -13,10 +13,10 @@ import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import TextCell +from docling.datamodel.accelerator_options import AcceleratorOptions from 
docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorOptions, OcrOptions, TesseractCliOcrOptions, ) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 108485d7..18ec34dc 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import TextCell +from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorOptions, OcrOptions, TesseractOcrOptions, ) diff --git a/docling/models/utils/__init__.py b/docling/models/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/models/utils/hf_model_download.py b/docling/models/utils/hf_model_download.py new file mode 100644 index 00000000..3595166a --- /dev/null +++ b/docling/models/utils/hf_model_download.py @@ -0,0 +1,40 @@ +import logging +from pathlib import Path +from typing import Optional + +_log = logging.getLogger(__name__) + + +def download_hf_model( + repo_id: str, + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + revision: Optional[str] = None, +) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + repo_id=repo_id, + force_download=force, + local_dir=local_dir, + revision=revision, + ) + + return Path(download_path) + + +class HuggingFaceModelDownloadMixin: + @staticmethod + def download_models( + repo_id: str, + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + return download_hf_model( + repo_id=repo_id, local_dir=local_dir, force=force, progress=progress + ) diff --git a/docling/models/vlm_models_inline/__init__.py b/docling/models/vlm_models_inline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py new file mode 100644 index 00000000..de7f289d --- /dev/null +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -0,0 +1,194 @@ +import importlib.metadata +import logging +import time +from collections.abc import Iterable +from pathlib import Path +from typing import Any, Optional + +from docling.datamodel.accelerator_options import ( + AcceleratorOptions, +) +from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options_vlm_model import ( + InlineVlmOptions, + TransformersModelType, +) +from docling.models.base_model import BasePageModel +from docling.models.utils.hf_model_download import ( + HuggingFaceModelDownloadMixin, +) +from docling.utils.accelerator_utils import decide_device +from docling.utils.profiling import TimeRecorder + +_log = logging.getLogger(__name__) + + +class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): + def __init__( + self, + enabled: bool, + artifacts_path: Optional[Path], + accelerator_options: AcceleratorOptions, + vlm_options: InlineVlmOptions, + ): + self.enabled = 
enabled + + self.vlm_options = vlm_options + + if self.enabled: + import torch + from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForVision2Seq, + AutoProcessor, + BitsAndBytesConfig, + GenerationConfig, + ) + + transformers_version = importlib.metadata.version("transformers") + if ( + self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct" + and transformers_version >= "4.52.0" + ): + raise NotImplementedError( + f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrade by running pip install -U 'transformers<4.52.0'." + ) + + self.device = decide_device( + accelerator_options.device, + supported_devices=vlm_options.supported_devices, + ) + _log.debug(f"Available device for VLM: {self.device}") + + self.use_cache = vlm_options.use_kv_cache + self.max_new_tokens = vlm_options.max_new_tokens + self.temperature = vlm_options.temperature + + repo_cache_folder = vlm_options.repo_id.replace("/", "--") + + if artifacts_path is None: + artifacts_path = self.download_models(self.vlm_options.repo_id) + elif (artifacts_path / repo_cache_folder).exists(): + artifacts_path = artifacts_path / repo_cache_folder + + self.param_quantization_config: Optional[BitsAndBytesConfig] = None + if vlm_options.quantized: + self.param_quantization_config = BitsAndBytesConfig( + load_in_8bit=vlm_options.load_in_8bit, + llm_int8_threshold=vlm_options.llm_int8_threshold, + ) + + model_cls: Any = AutoModel + if ( + self.vlm_options.transformers_model_type + == TransformersModelType.AUTOMODEL_CAUSALLM + ): + model_cls = AutoModelForCausalLM + elif ( + self.vlm_options.transformers_model_type + == TransformersModelType.AUTOMODEL_VISION2SEQ + ): + model_cls = AutoModelForVision2Seq + + self.processor = AutoProcessor.from_pretrained( + artifacts_path, + trust_remote_code=vlm_options.trust_remote_code, + ) + self.vlm_model = model_cls.from_pretrained( + artifacts_path, + device_map=self.device, + _attn_implementation=( + "flash_attention_2" + if self.device.startswith("cuda") + and accelerator_options.cuda_use_flash_attention2 + else "eager" + ), + trust_remote_code=vlm_options.trust_remote_code, + ) + + # Load generation config + self.generation_config = GenerationConfig.from_pretrained(artifacts_path) + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "vlm"): + assert page.size is not None + + hi_res_image = page.get_image(scale=self.vlm_options.scale) + + # Define prompt structure + prompt = self.formulate_prompt() + + inputs = self.processor( + text=prompt, images=[hi_res_image], return_tensors="pt" + ).to(self.device) + + start_time = time.time() + # Call model to generate: + generated_ids = self.vlm_model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + use_cache=self.use_cache, + temperature=self.temperature, + generation_config=self.generation_config, + **self.vlm_options.extra_generation_config, + ) + + generation_time = time.time() - start_time + generated_texts = self.processor.batch_decode( + generated_ids[:, inputs["input_ids"].shape[1] :], + skip_special_tokens=False, + )[0] + + num_tokens = len(generated_ids[0]) + _log.debug( + f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." 
+ ) + page.predictions.vlm_response = VlmPrediction( + text=generated_texts, + generation_time=generation_time, + ) + + yield page + + def formulate_prompt(self) -> str: + """Formulate a prompt for the VLM.""" + + if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": + _log.debug("Using specialized prompt for Phi-4") + # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally + + user_prompt = "<|user|>" + assistant_prompt = "<|assistant|>" + prompt_suffix = "<|end|>" + + prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}" + _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}") + + return prompt + + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "This is a page from a document.", + }, + {"type": "image"}, + {"type": "text", "text": self.vlm_options.prompt}, + ], + } + ] + prompt = self.processor.apply_chat_template( + messages, add_generation_prompt=False + ) + return prompt diff --git a/docling/models/hf_mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py similarity index 55% rename from docling/models/hf_mlx_model.py rename to docling/models/vlm_models_inline/mlx_model.py index 63f8fc95..d8b90407 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/vlm_models_inline/mlx_model.py @@ -4,29 +4,34 @@ from collections.abc import Iterable from pathlib import Path from typing import Optional -from docling.datamodel.base_models import Page, VlmPrediction -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( +from docling.datamodel.accelerator_options import ( AcceleratorOptions, - HuggingFaceVlmOptions, ) +from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions from docling.models.base_model import BasePageModel +from docling.models.utils.hf_model_download import ( + HuggingFaceModelDownloadMixin, +) from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) -class HuggingFaceMlxModel(BasePageModel): +class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin): def __init__( self, enabled: bool, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions, - vlm_options: HuggingFaceVlmOptions, + vlm_options: InlineVlmOptions, ): self.enabled = enabled self.vlm_options = vlm_options + self.max_tokens = vlm_options.max_new_tokens + self.temperature = vlm_options.temperature if self.enabled: try: @@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel): ) repo_cache_folder = vlm_options.repo_id.replace("/", "--") + self.apply_chat_template = apply_chat_template self.stream_generate = stream_generate # PARAMETERS: if artifacts_path is None: - artifacts_path = self.download_models(self.vlm_options.repo_id) + artifacts_path = self.download_models( + self.vlm_options.repo_id, + ) elif (artifacts_path / repo_cache_folder).exists(): artifacts_path = artifacts_path / repo_cache_folder - self.param_question = vlm_options.prompt # "Perform Layout Analysis." 
+ self.param_question = vlm_options.prompt ## Load the model self.vlm_model, self.processor = load(artifacts_path) self.config = load_config(artifacts_path) - @staticmethod - def download_models( - repo_id: str, - local_dir: Optional[Path] = None, - force: bool = False, - progress: bool = False, - ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - if not progress: - disable_progress_bars() - download_path = snapshot_download( - repo_id=repo_id, - force_download=force, - local_dir=local_dir, - # revision="v0.0.1", - ) - - return Path(download_path) - def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: @@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel): if not page._backend.is_valid(): yield page else: - with TimeRecorder(conv_res, "vlm"): + with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"): assert page.size is not None - hi_res_image = page.get_image(scale=2.0) # 144dpi - # hi_res_image = page.get_image(scale=1.0) # 72dpi - + hi_res_image = page.get_image(scale=self.vlm_options.scale) if hi_res_image is not None: im_width, im_height = hi_res_image.size @@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel): ) start_time = time.time() + _log.debug("start generating ...") + # Call model to generate: + tokens: list[VlmPredictionToken] = [] + output = "" for token in self.stream_generate( self.vlm_model, self.processor, prompt, [hi_res_image], - max_tokens=4096, + max_tokens=self.max_tokens, verbose=False, + temp=self.temperature, ): + if len(token.logprobs.shape) == 1: + tokens.append( + VlmPredictionToken( + text=token.text, + token=token.token, + logprob=token.logprobs[token.token], + ) + ) + elif ( + len(token.logprobs.shape) == 2 + and token.logprobs.shape[0] == 1 + ): + tokens.append( + VlmPredictionToken( + text=token.text, + token=token.token, + logprob=token.logprobs[0, token.token], + ) + ) + else: + _log.warning( + f"incompatible shape for logprobs: {token.logprobs.shape}" + ) + output += token.text if "</doctag>" in token.text: break @@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel): generation_time = time.time() - start_time page_tags = output - _log.debug(f"Generation time {generation_time:.2f} seconds.") - - # inference_time = time.time() - start_time - # tokens_per_second = num_tokens / generation_time - # print("") - # print(f"Page Inference Time: {inference_time:.2f} seconds") - # print(f"Total tokens on page: {num_tokens:.2f}") - # print(f"Tokens/sec: {tokens_per_second:.2f}") - # print("") - page.predictions.vlm_response = VlmPrediction(text=page_tags) + _log.debug( + f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)." 
+ ) + page.predictions.vlm_response = VlmPrediction( + text=page_tags, + generation_time=generation_time, + generated_tokens=tokens, + ) yield page diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 4269900c..88317fd3 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores +from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings @@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline): "When defined, it must point to a folder containing all models required by the pipeline." ) - self.keep_images = ( - self.pipeline_options.generate_page_images - or self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ) + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + self.keep_images = ( + self.pipeline_options.generate_page_images + or self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) @@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline): ) # Generate images of the requested element types - if ( - self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ): - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: - continue - if ( - isinstance(element, PictureItem) - and self.pipeline_options.generate_picture_images - ) or ( - isinstance(element, TableItem) - and self.pipeline_options.generate_table_images - ): - page_ix = element.prov[0].page_no - 1 - page = next( - (p for p in conv_res.pages if p.page_no == page_ix), - cast("Page", None), - ) - assert page is not None - assert page.size is not None - assert page.image is not None + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + page = next( + (p for p in conv_res.pages if p.page_no == page_ix), + cast("Page", None), + ) + assert page is not None + assert page.size is not None + assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) - ) + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin( + page_height=page.size.height * scale + ) + ) - 
cropped_im = page.image.crop(crop_bbox.as_tuple()) - element.image = ImageRef.from_pil( - cropped_im, dpi=int(72 * scale) - ) + cropped_im = page.image.crop(crop_bbox.as_tuple()) + element.image = ImageRef.from_pil( + cropped_im, dpi=int(72 * scale) + ) # Aggregate confidence values for document: if len(conv_res.pages) > 0: - conv_res.confidence.layout_score = float( - np.nanmean( - [c.layout_score for c in conv_res.confidence.pages.values()] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=RuntimeWarning, + message="Mean of empty slice|All-NaN slice encountered", ) - ) - conv_res.confidence.parse_score = float( - np.nanquantile( - [c.parse_score for c in conv_res.confidence.pages.values()], - q=0.1, # parse score should relate to worst 10% of pages. + conv_res.confidence.layout_score = float( + np.nanmean( + [c.layout_score for c in conv_res.confidence.pages.values()] + ) ) - ) - conv_res.confidence.table_score = float( - np.nanmean( - [c.table_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.parse_score = float( + np.nanquantile( + [c.parse_score for c in conv_res.confidence.pages.values()], + q=0.1, # parse score should relate to worst 10% of pages. + ) ) - ) - conv_res.confidence.ocr_score = float( - np.nanmean( - [c.ocr_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.table_score = float( + np.nanmean( + [c.table_score for c in conv_res.confidence.pages.values()] + ) + ) + conv_res.confidence.ocr_score = float( + np.nanmean( + [c.ocr_score for c in conv_res.confidence.pages.values()] + ) ) - ) return conv_res diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 70a46920..2ecfe55a 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -1,29 +1,46 @@ import logging +import re from io import BytesIO from pathlib import Path from typing import List, Optional, Union, cast -from docling_core.types import DoclingDocument -from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem +from docling_core.types.doc import ( + BoundingBox, + DocItem, + DoclingDocument, + ImageRef, + PictureItem, + ProvenanceItem, + TextItem, +) +from docling_core.types.doc.base import ( + BoundingBox, + Size, +) from docling_core.types.doc.document import DocTagsDocument from PIL import Image as PILImage from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import ( - ApiVlmOptions, - HuggingFaceVlmOptions, - InferenceFramework, - ResponseFormat, VlmPipelineOptions, ) +from docling.datamodel.pipeline_options_vlm_model import ( + ApiVlmOptions, + InferenceFramework, + InlineVlmOptions, + ResponseFormat, +) from docling.datamodel.settings import settings from docling.models.api_vlm_model import ApiVlmModel -from docling.models.hf_mlx_model import HuggingFaceMlxModel -from docling.models.hf_vlm_model import HuggingFaceVlmModel +from docling.models.vlm_models_inline.hf_transformers_model import ( + HuggingFaceTransformersVlmModel, +) +from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel from docling.pipeline.base_pipeline import PaginatedPipeline from 
docling.utils.profiling import ProfilingScope, TimeRecorder @@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline): vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options), ), ] - elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions): - vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options) + elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions): + vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options) if vlm_options.inference_framework == InferenceFramework.MLX: self.build_pipe = [ HuggingFaceMlxModel( @@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline): vlm_options=vlm_options, ), ] - else: + elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS: self.build_pipe = [ - HuggingFaceVlmModel( + HuggingFaceTransformersVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ), ] + else: + raise ValueError( + f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" + ) self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument @@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline): self.pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS ): - doctags_list = [] - image_list = [] - for page in conv_res.pages: - predicted_doctags = "" - img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)") - if page.predictions.vlm_response: - predicted_doctags = page.predictions.vlm_response.text - if page.image: - img = page.image - image_list.append(img) - doctags_list.append(predicted_doctags) + conv_res.document = self._turn_dt_into_doc(conv_res) - doctags_list_c = cast(List[Union[Path, str]], doctags_list) - image_list_c = cast(List[Union[Path, PILImage.Image]], image_list) - doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( - doctags_list_c, image_list_c - ) - conv_res.document = DoclingDocument.load_from_doctags(doctags_doc) - - # If forced backend text, replace model predicted text with backend one - if self.force_backend_text: - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, TextItem) or len(element.prov) == 0: - continue - page_ix = element.prov[0].page_no - 1 - page = conv_res.pages[page_ix] - if not page.size: - continue - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) - ) - txt = self.extract_text_from_backend(page, crop_bbox) - element.text = txt - element.orig = txt elif ( self.pipeline_options.vlm_options.response_format == ResponseFormat.MARKDOWN ): conv_res.document = self._turn_md_into_doc(conv_res) + elif ( + self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML + ): + conv_res.document = self._turn_html_into_doc(conv_res) + else: raise RuntimeError( f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}" @@ -192,23 +183,199 @@ class VlmPipeline(PaginatedPipeline): return conv_res - def _turn_md_into_doc(self, conv_res): - predicted_text = "" - for pg_idx, page in enumerate(conv_res.pages): + def _turn_dt_into_doc(self, conv_res) -> DoclingDocument: + doctags_list = [] + image_list = [] + for page in conv_res.pages: + predicted_doctags = "" + img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)") if page.predictions.vlm_response: - predicted_text 
+= page.predictions.vlm_response.text + "\n\n" - response_bytes = BytesIO(predicted_text.encode("utf8")) - out_doc = InputDocument( - path_or_stream=response_bytes, - filename=conv_res.input.file.name, - format=InputFormat.MD, - backend=MarkdownDocumentBackend, + predicted_doctags = page.predictions.vlm_response.text + if page.image: + img = page.image + image_list.append(img) + doctags_list.append(predicted_doctags) + + doctags_list_c = cast(List[Union[Path, str]], doctags_list) + image_list_c = cast(List[Union[Path, PILImage.Image]], image_list) + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( + doctags_list_c, image_list_c ) - backend = MarkdownDocumentBackend( - in_doc=out_doc, - path_or_stream=response_bytes, + conv_res.document = DoclingDocument.load_from_doctags( + doctag_document=doctags_doc ) - return backend.convert() + + # If forced backend text, replace model predicted text with backend one + if page.size: + if self.force_backend_text: + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, TextItem) or len(element.prov) == 0: + continue + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin(page_height=page.size.height * scale) + ) + txt = self.extract_text_from_backend(page, crop_bbox) + element.text = txt + element.orig = txt + + return conv_res.document + + def _turn_md_into_doc(self, conv_res): + def _extract_markdown_code(text): + """ + Extracts text from markdown code blocks (enclosed in triple backticks). + If no code blocks are found, returns the original text. + + Args: + text (str): Input text that may contain markdown code blocks + + Returns: + str: Extracted code if code blocks exist, otherwise original text + """ + # Regex pattern to match content between triple backticks + # This handles multiline content and optional language specifier + pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$" + + # Search with DOTALL flag to match across multiple lines + mtch = re.search(pattern, text, re.DOTALL) + + if mtch: + # Return only the content of the first capturing group + return mtch.group(1) + else: + # No code blocks found, return original text + return text + + for pg_idx, page in enumerate(conv_res.pages): + page_no = pg_idx + 1 # FIXME: might be incorrect + + predicted_text = "" + if page.predictions.vlm_response: + predicted_text = page.predictions.vlm_response.text + "\n\n" + + predicted_text = _extract_markdown_code(text=predicted_text) + + response_bytes = BytesIO(predicted_text.encode("utf8")) + out_doc = InputDocument( + path_or_stream=response_bytes, + filename=conv_res.input.file.name, + format=InputFormat.MD, + backend=MarkdownDocumentBackend, + ) + backend = MarkdownDocumentBackend( + in_doc=out_doc, + path_or_stream=response_bytes, + ) + page_doc = backend.convert() + + if page.image is not None: + pg_width = page.image.width + pg_height = page.image.height + else: + pg_width = 1 + pg_height = 1 + + conv_res.document.add_page( + page_no=page_no, + size=Size(width=pg_width, height=pg_height), + image=ImageRef.from_pil(image=page.image, dpi=72) + if page.image + else None, + ) + + for item, level in page_doc.iterate_items(): + item.prov = [ + ProvenanceItem( + page_no=pg_idx + 1, + bbox=BoundingBox( + t=0.0, b=0.0, l=0.0, r=0.0 + ), # FIXME: would be nice not to have to "fake" it + charspan=[0, 0], + ) + ] + conv_res.document.append_child_item(child=item) + + return conv_res.document + + def _turn_html_into_doc(self, conv_res): + def 
_extract_html_code(text): + """ + Extracts HTML content from markdown code blocks (enclosed in triple backticks). + If no code blocks are found, returns the original text. + + Args: + text (str): Input text that may contain markdown code blocks + + Returns: + str: Extracted code if code blocks exist, otherwise original text + """ + # Regex pattern to match content between triple backticks + # This handles multiline content and optional language specifier + pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$" + + # Search with DOTALL flag to match across multiple lines + mtch = re.search(pattern, text, re.DOTALL) + + if mtch: + # Return only the content of the first capturing group + return mtch.group(1) + else: + # No code blocks found, return original text + return text + + for pg_idx, page in enumerate(conv_res.pages): + page_no = pg_idx + 1 # FIXME: might be incorrect + + predicted_text = "" + if page.predictions.vlm_response: + predicted_text = page.predictions.vlm_response.text + "\n\n" + + predicted_text = _extract_html_code(text=predicted_text) + + response_bytes = BytesIO(predicted_text.encode("utf8")) + out_doc = InputDocument( + path_or_stream=response_bytes, + filename=conv_res.input.file.name, + format=InputFormat.HTML, + backend=HTMLDocumentBackend, + ) + backend = HTMLDocumentBackend( + in_doc=out_doc, + path_or_stream=response_bytes, + ) + page_doc = backend.convert() + + if page.image is not None: + pg_width = page.image.width + pg_height = page.image.height + else: + pg_width = 1 + pg_height = 1 + + conv_res.document.add_page( + page_no=page_no, + size=Size(width=pg_width, height=pg_height), + image=ImageRef.from_pil(image=page.image, dpi=72) + if page.image + else None, + ) + + for item, level in page_doc.iterate_items(): + item.prov = [ + ProvenanceItem( + page_no=pg_idx + 1, + bbox=BoundingBox( + t=0.0, b=0.0, l=0.0, r=0.0 + ), # FIXME: would be nice not to have to "fake" it + charspan=[0, 0], + ) + ] + conv_res.document.append_child_item(child=item) + + return conv_res.document @classmethod def get_default_options(cls) -> VlmPipelineOptions: diff --git a/docling/utils/accelerator_utils.py b/docling/utils/accelerator_utils.py index 8c930250..09b6651e 100644 --- a/docling/utils/accelerator_utils.py +++ b/docling/utils/accelerator_utils.py @@ -1,13 +1,16 @@ import logging +from typing import List, Optional import torch -from docling.datamodel.pipeline_options import AcceleratorDevice +from docling.datamodel.accelerator_options import AcceleratorDevice _log = logging.getLogger(__name__) -def decide_device(accelerator_device: str) -> str: +def decide_device( + accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None +) -> str: r""" Resolve the device based on the acceleration options and the available devices in the system.
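+ When `supported_devices` is provided, CUDA and MPS are only considered if they appear in that list.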
@@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str: has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available() has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available() + if supported_devices is not None: + if has_cuda and AcceleratorDevice.CUDA not in supported_devices: + _log.info( + f"Removing CUDA from available devices because it is not in {supported_devices=}" + ) + has_cuda = False + if has_mps and AcceleratorDevice.MPS not in supported_devices: + _log.info( + f"Removing MPS from available devices because it is not in {supported_devices=}" + ) + has_mps = False + if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto' if has_cuda: device = "cuda:0" diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index eb20f255..55383c03 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -4,18 +4,20 @@ from typing import Optional from docling.datamodel.pipeline_options import ( granite_picture_description, - smoldocling_vlm_conversion_options, - smoldocling_vlm_mlx_conversion_options, smolvlm_picture_description, ) from docling.datamodel.settings import settings +from docling.datamodel.vlm_model_specs import ( + SMOLDOCLING_MLX, + SMOLDOCLING_TRANSFORMERS, +) from docling.models.code_formula_model import CodeFormulaModel from docling.models.document_picture_classifier import DocumentPictureClassifier from docling.models.easyocr_model import EasyOcrModel -from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.models.layout_model import LayoutModel from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel from docling.models.table_structure_model import TableStructureModel +from docling.models.utils.hf_model_download import download_hf_model _log = logging.getLogger(__name__) @@ -75,7 +77,7 @@ def download_models( if with_smolvlm: _log.info("Downloading SmolVlm model...") - PictureDescriptionVlmModel.download_models( + download_hf_model( repo_id=smolvlm_picture_description.repo_id, local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, force=force, @@ -84,26 +86,25 @@ def download_models( if with_smoldocling: _log.info("Downloading SmolDocling model...") - HuggingFaceVlmModel.download_models( - repo_id=smoldocling_vlm_conversion_options.repo_id, - local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder, + download_hf_model( + repo_id=SMOLDOCLING_TRANSFORMERS.repo_id, + local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder, force=force, progress=progress, ) if with_smoldocling_mlx: _log.info("Downloading SmolDocling MLX model...") - HuggingFaceVlmModel.download_models( - repo_id=smoldocling_vlm_mlx_conversion_options.repo_id, - local_dir=output_dir - / smoldocling_vlm_mlx_conversion_options.repo_cache_folder, + download_hf_model( + repo_id=SMOLDOCLING_MLX.repo_id, + local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder, force=force, progress=progress, ) if with_granite_vision: _log.info("Downloading Granite Vision model...") - PictureDescriptionVlmModel.download_models( + download_hf_model( repo_id=granite_picture_description.repo_id, local_dir=output_dir / granite_picture_description.repo_cache_folder, force=force, diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py new file mode 100644 index 00000000..f9bd2dcd --- /dev/null +++ b/docs/examples/compare_vlm_models.py @@ -0,0 +1,160 @@ +# Compare VLM models +# ================== +# +# This 
example runs the VLM pipeline with different vision-language models. +# Their runtime and output quality are compared. + +import sys +import time +from pathlib import Path + +from docling_core.types.doc import DocItemLabel, ImageRefMode +from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS +from tabulate import tabulate + +from docling.datamodel import vlm_model_specs +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + VlmPipelineOptions, +) +from docling.datamodel.pipeline_options_vlm_model import InferenceFramework +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.vlm_pipeline import VlmPipeline + + +def convert(sources: list[Path], converter: DocumentConverter): + model_id = pipeline_options.vlm_options.repo_id.replace("/", "_") + framework = pipeline_options.vlm_options.inference_framework + for source in sources: + print("================================================") + print("Processing...") + print(f"Source: {source}") + print("---") + print(f"Model: {model_id}") + print(f"Framework: {framework}") + print("================================================") + print("") + + res = converter.convert(source) + + print("") + + fname = f"{res.input.file.stem}-{model_id}-{framework}" + + inference_time = 0.0 + for i, page in enumerate(res.pages): + inference_time += page.predictions.vlm_response.generation_time + print("") + print( + f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:" + ) + print(page.predictions.vlm_response.text) + print(" ---------- ") + + print("===== Final output of the converted document =======") + + res.document.save_as_json( + out_path / f"{fname}.json", + image_mode=ImageRefMode.PLACEHOLDER, + ) + print(f" => produced {out_path / fname}.json") + + res.document.save_as_markdown( + out_path / f"{fname}.md", + image_mode=ImageRefMode.PLACEHOLDER, + ) + print(f" => produced {out_path / fname}.md") + + res.document.save_as_html( + out_path / f"{fname}.html", + image_mode=ImageRefMode.EMBEDDED, + labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], + split_page_view=True, + ) + print(f" => produced {out_path / fname}.html") + + pg_num = res.document.num_pages() + print("") + print( + f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}" + ) + print("====================================================") + + return [ + source, + model_id, + str(framework), + pg_num, + inference_time, + ] + + +if __name__ == "__main__": + sources = [ + "tests/data/pdf/2305.03393v1-pg9.pdf", + ] + + out_path = Path("scratch") + out_path.mkdir(parents=True, exist_ok=True) + + ## Use VlmPipeline + pipeline_options = VlmPipelineOptions() + pipeline_options.generate_page_images = True + + ## On GPU systems, enable flash_attention_2 with CUDA: + # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA + # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True + + vlm_models = [ + ## DocTags / SmolDocling models + vlm_model_specs.SMOLDOCLING_MLX, + vlm_model_specs.SMOLDOCLING_TRANSFORMERS, + ## Markdown models (using MLX framework) + vlm_model_specs.QWEN25_VL_3B_MLX, + vlm_model_specs.PIXTRAL_12B_MLX, + vlm_model_specs.GEMMA3_12B_MLX, + ## Markdown models (using Transformers framework) +
vlm_model_specs.GRANITE_VISION_TRANSFORMERS, + vlm_model_specs.PHI4_TRANSFORMERS, + vlm_model_specs.PIXTRAL_12B_TRANSFORMERS, + ] + + # Remove MLX models if not on Mac + if sys.platform != "darwin": + vlm_models = [ + m for m in vlm_models if m.inference_framework != InferenceFramework.MLX + ] + + rows = [] + for vlm_options in vlm_models: + pipeline_options.vlm_options = vlm_options + + ## Set up pipeline for PDF or image inputs + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ), + InputFormat.IMAGE: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ), + }, + ) + + row = convert(sources=sources, converter=converter) + rows.append(row) + + print( + tabulate( + rows, headers=["source", "model_id", "framework", "num_pages", "time"] + ) + ) + + print("see if memory gets released ...") + time.sleep(10) diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 3b8ae6df..12dfacd5 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -3,10 +3,9 @@ import logging import time from pathlib import Path +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, PdfPipelineOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py index 96288b08..666c0604 100644 --- a/docs/examples/minimal_vlm_pipeline.py +++ b/docs/examples/minimal_vlm_pipeline.py @@ -1,101 +1,46 @@ -import json -import time -from pathlib import Path - -from docling_core.types.doc import DocItemLabel, ImageRefMode -from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS - +from docling.datamodel import vlm_model_specs from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( VlmPipelineOptions, - smoldocling_vlm_mlx_conversion_options, ) from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline -sources = [ - # "tests/data/2305.03393v1-pg9-img.png", - "tests/data/pdf/2305.03393v1-pg9.pdf", -] +source = "https://arxiv.org/pdf/2501.17887" -## Use experimental VlmPipeline -pipeline_options = VlmPipelineOptions() -# If force_backend_text = True, text from backend will be used instead of generated text -pipeline_options.force_backend_text = False +###### USING SIMPLE DEFAULT VALUES +# - SmolDocling model +# - Using the transformers framework -## On GPU systems, enable flash_attention_2 with CUDA: -# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA -# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + ), + } +) -## Pick a VLM model. We choose SmolDocling-256M by default -# pipeline_options.vlm_options = smoldocling_vlm_conversion_options +doc = converter.convert(source=source).document -## Pick a VLM model. 
Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX -pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options +print(doc.export_to_markdown()) -## Alternative VLM models: -# pipeline_options.vlm_options = granite_vision_vlm_conversion_options -## Set up pipeline for PDF or image inputs +###### USING MACOS MPS ACCELERATOR +# For more options see the compare_vlm_models.py example. + +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.SMOLDOCLING_MLX, +) + converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_cls=VlmPipeline, pipeline_options=pipeline_options, ), - InputFormat.IMAGE: PdfFormatOption( - pipeline_cls=VlmPipeline, - pipeline_options=pipeline_options, - ), } ) -out_path = Path("scratch") -out_path.mkdir(parents=True, exist_ok=True) +doc = converter.convert(source=source).document -for source in sources: - start_time = time.time() - print("================================================") - print(f"Processing... {source}") - print("================================================") - print("") - - res = converter.convert(source) - - print("") - print(res.document.export_to_markdown()) - - for page in res.pages: - print("") - print("Predicted page in DOCTAGS:") - print(page.predictions.vlm_response.text) - - res.document.save_as_html( - filename=Path(f"{out_path}/{res.input.file.stem}.html"), - image_mode=ImageRefMode.REFERENCED, - labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], - ) - - with (out_path / f"{res.input.file.stem}.json").open("w") as fp: - fp.write(json.dumps(res.document.export_to_dict())) - - res.document.save_as_json( - out_path / f"{res.input.file.stem}.json", - image_mode=ImageRefMode.PLACEHOLDER, - ) - - res.document.save_as_markdown( - out_path / f"{res.input.file.stem}.md", - image_mode=ImageRefMode.PLACEHOLDER, - ) - - pg_num = res.document.num_pages() - print("") - inference_time = time.time() - start_time - print( - f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}" - ) - -print("================================================") -print("done!") -print("================================================") +print(doc.export_to_markdown()) diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py index a5380740..6b3ddc68 100644 --- a/docs/examples/run_with_accelerator.py +++ b/docs/examples/run_with_accelerator.py @@ -1,9 +1,8 @@ from pathlib import Path +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, PdfPipelineOptions, ) from docling.datamodel.settings import settings diff --git a/docs/examples/translate.py b/docs/examples/translate.py index 229d5451..f2711a23 100644 --- a/docs/examples/translate.py +++ b/docs/examples/translate.py @@ -1,5 +1,4 @@ import logging -import time from pathlib import Path from docling_core.types.doc import ImageRefMode, TableItem, TextItem diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 504cecc5..ec29e21c 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -7,10 +7,9 @@ from dotenv import load_dotenv from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - ApiVlmOptions, - ResponseFormat, VlmPipelineOptions, ) +from 
docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline diff --git a/docs/index.md b/docs/index.md index abb1b487..ad9ac80e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,7 @@ Docling simplifies document processing, parsing diverse formats — including ad * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images -* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥 +* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥 * 💻 Simple and convenient CLI ### Coming soon @@ -39,7 +39,7 @@ Docling simplifies document processing, parsing diverse formats — including ad ## Get started
- Concepts
Learn Docling fundamendals
+ Concepts
Learn Docling fundamentals
Examples
Try out recipes for various use cases, including conversion, RAG, and more
Integrations
Check out integrations with popular frameworks and tools
Reference
See more API details
diff --git a/docs/usage/vision_models.md b/docs/usage/vision_models.md new file mode 100644 index 00000000..ba3fc3eb --- /dev/null +++ b/docs/usage/vision_models.md @@ -0,0 +1,121 @@ + +The `VlmPipeline` in Docling allows converting documents end-to-end using a vision-language model. + +Docling supports vision-language models which output: + +- DocTags (e.g. [SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)), the preferred choice +- Markdown +- HTML + + +To run Docling with local models using the `VlmPipeline`: + +=== "CLI" + + ```bash + docling --pipeline vlm FILE + ``` + +=== "Python" + + See also the example [minimal_vlm_pipeline.py](./../examples/minimal_vlm_pipeline.py). + + ```python + from docling.datamodel.base_models import InputFormat + from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.pipeline.vlm_pipeline import VlmPipeline + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + ), + } + ) + + doc = converter.convert(source="FILE").document + ``` + +## Available local models + +By default, the vision-language models run locally. +Docling allows choosing between the Hugging Face [Transformers](https://github.com/huggingface/transformers) framework and [MLX](https://github.com/Blaizzy/mlx-vlm) (for Apple devices with MPS acceleration). + +The following table reports the models currently available out-of-the-box. + +| Model instance | Model | Framework | Device | Num pages | Inference time (sec) | +| ---------------|------ | --------- | ------ | --------- | ---------------------| +| `vlm_model_specs.SMOLDOCLING_TRANSFORMERS` | [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 102.212 | +| `vlm_model_specs.SMOLDOCLING_MLX` | [ds4sd/SmolDocling-256M-preview-mlx-bf16](https://huggingface.co/ds4sd/SmolDocling-256M-preview-mlx-bf16) | `MLX` | MPS | 1 | 6.15453 | +| `vlm_model_specs.QWEN25_VL_3B_MLX` | [mlx-community/Qwen2.5-VL-3B-Instruct-bf16](https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-bf16) | `MLX` | MPS | 1 | 23.4951 | +| `vlm_model_specs.PIXTRAL_12B_MLX` | [mlx-community/pixtral-12b-bf16](https://huggingface.co/mlx-community/pixtral-12b-bf16) | `MLX` | MPS | 1 | 308.856 | +| `vlm_model_specs.GEMMA3_12B_MLX` | [mlx-community/gemma-3-12b-it-bf16](https://huggingface.co/mlx-community/gemma-3-12b-it-bf16) | `MLX` | MPS | 1 | 378.486 | +| `vlm_model_specs.GRANITE_VISION_TRANSFORMERS` | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 104.75 | +| `vlm_model_specs.PHI4_TRANSFORMERS` | [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | `Transformers/AutoModelForCausalLM` | CPU | 1 | 1175.67 | +| `vlm_model_specs.PIXTRAL_12B_TRANSFORMERS` | [mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b) | `Transformers/AutoModelForVision2Seq` | CPU | 1 | 1828.21 | + +_Inference time is computed on a MacBook M3 Max using the example page `tests/data/pdf/2305.03393v1-pg9.pdf`.
The comparison is done with the example [compare_vlm_models.py](./../examples/compare_vlm_models.py)._ + +To choose the model, the code snippet above can be extended as follows: + +```python +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.vlm_pipeline import VlmPipeline +from docling.datamodel.pipeline_options import ( + VlmPipelineOptions, +) +from docling.datamodel import vlm_model_specs + +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.SMOLDOCLING_MLX, # <-- change the model here +) + +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ), + } +) + +doc = converter.convert(source="FILE").document +``` + +### Other models + +Other models can be configured by directly providing the Hugging Face `repo_id`, the prompt, and a few more options. + +For example: + +```python +from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel.pipeline_options_vlm_model import ( + InferenceFramework, + InlineVlmOptions, + ResponseFormat, + TransformersModelType, +) + +pipeline_options = VlmPipelineOptions( + vlm_options=InlineVlmOptions( + repo_id="ibm-granite/granite-vision-3.2-2b", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + supported_devices=[ + AcceleratorDevice.CPU, + AcceleratorDevice.CUDA, + AcceleratorDevice.MPS, + ], + scale=2.0, + temperature=0.0, + ) +) +``` + + +## Remote models + +In addition to local models, the `VlmPipeline` allows offloading the inference to a remote service hosting the models. +Many remote inference services can be used; the key requirement is that they offer an OpenAI-compatible API. This includes vLLM, Ollama, and others. + +More examples of how to connect to remote inference services can be found in the following example: + +- [vlm_pipeline_api_model.py](./../examples/vlm_pipeline_api_model.py) diff --git a/mkdocs.yml b/mkdocs.yml index 2e40158e..db8bf27e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,6 +60,7 @@ nav: - Usage: usage/index.md - Supported formats: usage/supported_formats.md - Enrichment features: usage/enrichments.md + - Vision models: usage/vision_models.md - FAQ: - FAQ: faq/index.md - Concepts: @@ -78,6 +79,7 @@ nav: - "Multi-format conversion": examples/run_with_formats.py - "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py - "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py + - "VLM comparison": examples/compare_vlm_models.py - "Figure export": examples/export_figures.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py diff --git a/pyproject.toml b/pyproject.toml index c28c6588..6075a9fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docling" -version = "2.34.0" # DO NOT EDIT, updated automatically +version = "2.35.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT" keywords = [ diff --git a/tests/data/groundtruth/docling_v2/example_08.html.itxt b/tests/data/groundtruth/docling_v2/example_08.html.itxt new file mode 100644 index 00000000..505408e3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_08.html.itxt @@ -0,0 +1,8 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-1 + item-2 at level 2: section_header: Pivot table with with 1 row header + item-3 at level 3: table with [6x4] + item-4 at level 2: section_header: Pivot table with 2 row headers + item-5 at level 3: table with [6x5] + item-6 at level 2: section_header: Equivalent pivot table + item-7 at level 3: table with [6x5] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_08.html.json b/tests/data/groundtruth/docling_v2/example_08.html.json new file mode 100644 index 00000000..085be7ef --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_08.html.json @@ -0,0 +1,2008 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "example_08", + "origin": { + "mimetype": "text/html", + "binary_hash": 12799593797322619937, + "filename": "example_08.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "header-1", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Pivot table with with 1 row header", + "text": "Pivot table with with 1 row header", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/1" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Pivot table with 2 row headers", + "text": "Pivot table with 2 row headers", + "level": 1 + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/2" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Equivalent pivot table", + "text": "Equivalent pivot table", + "level": 1 + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + 
"start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": 
false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + 
"row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + 
"start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": 
false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + 
"col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + 
"end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": 
false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + 
"start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + 
"text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + 
"row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_08.html.md b/tests/data/groundtruth/docling_v2/example_08.html.md new file mode 100644 index 00000000..462a8101 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_08.html.md @@ -0,0 +1,29 @@ +## Pivot table with with 1 row header + +| Year | Month | Revenue | Cost | +|--------|----------|-----------|--------| +| 2025 | January | $134 | $162 | +| 2025 | February | $150 | $155 | +| 2025 | March | $160 | $143 | +| 2025 | April | $210 | $150 | +| 2025 | May | $280 | $120 | + +## Pivot table with 2 row headers + +| Year | Quarter | Month | Revenue | Cost | +|--------|-----------|----------|-----------|--------| +| 2025 | Q1 | January | $134 | $162 | +| 2025 | Q1 | February | $150 | $155 | +| 2025 | Q1 | March | $160 | $143 | +| 2025 | Q2 | April | $210 | $150 | +| 2025 | Q2 | May | $280 | $120 | + +## Equivalent pivot table + +| Year | Quarter | Month | Revenue | Cost | +|--------|-----------|----------|-----------|--------| +| 2025 | Q1 | January | $134 | $162 | +| 2025 | Q1 | February | $150 | $155 | +| 2025 | Q1 | March | $160 | $143 | +| 2025 | Q2 | April | $210 | $150 | +| 2025 | Q2 | May | $280 | $120 | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt new file mode 100644 index 00000000..2933724f --- /dev/null +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -0,0 +1,94 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten + item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... 
r the 113th Academic Year Kindergarten + item-3 at level 1: paragraph: + item-4 at level 1: section: group textbox + item-5 at level 2: paragraph: Student falls ill + item-6 at level 2: paragraph: + item-7 at level 2: paragraph: + item-8 at level 2: list: group list + item-9 at level 3: list_item: Suggested Reportable Symptoms: +* ... sh +* Blisters +* Headache +* Sore throat + item-10 at level 1: list_item: + item-11 at level 1: paragraph: + item-12 at level 1: paragraph: + item-13 at level 1: section: group textbox + item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-15 at level 1: paragraph: + item-16 at level 1: paragraph: + item-17 at level 1: paragraph: + item-18 at level 1: paragraph: + item-19 at level 1: section: group textbox + item-20 at level 2: paragraph: Yes + item-21 at level 1: paragraph: + item-22 at level 1: paragraph: + item-23 at level 1: section: group textbox + item-24 at level 2: paragraph:  A report must be submitted wi ... saster Prevention Information Network. + item-25 at level 2: paragraph:  A report must also be submitt ... d Infectious Disease Reporting System. + item-26 at level 2: paragraph: + item-27 at level 2: paragraph: + item-28 at level 1: paragraph: + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: + item-31 at level 1: paragraph: + item-32 at level 1: paragraph: + item-33 at level 1: paragraph: + item-34 at level 1: section: group textbox + item-35 at level 2: paragraph: Health Bureau: + item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-37 at level 2: list: group list + item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-40 at level 2: paragraph: + item-41 at level 2: paragraph: + item-42 at level 1: list: group list + item-43 at level 2: list_item: + item-44 at level 1: paragraph: + item-45 at level 1: section: group textbox + item-46 at level 2: paragraph: Department of Education: +Collabo ... vention measures at all school levels. + item-47 at level 1: paragraph: + item-48 at level 1: paragraph: + item-49 at level 1: paragraph: + item-50 at level 1: paragraph: + item-51 at level 1: paragraph: + item-52 at level 1: paragraph: + item-53 at level 1: paragraph: + item-54 at level 1: section: group textbox + item-55 at level 2: inline: group group + item-56 at level 3: paragraph: The Health Bureau will handle + item-57 at level 3: paragraph: reporting and specimen collection + item-58 at level 3: paragraph: . + item-59 at level 2: paragraph: + item-60 at level 2: paragraph: + item-61 at level 1: paragraph: + item-62 at level 1: paragraph: + item-63 at level 1: paragraph: + item-64 at level 1: section: group textbox + item-65 at level 2: paragraph: Whether the epidemic has eased. + item-66 at level 2: paragraph: + item-67 at level 2: paragraph: + item-68 at level 1: paragraph: + item-69 at level 1: section: group textbox + item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. 
+ item-71 at level 2: paragraph: No + item-72 at level 1: paragraph: + item-73 at level 1: paragraph: + item-74 at level 1: section: group textbox + item-75 at level 1: paragraph: + item-76 at level 1: section: group textbox + item-77 at level 1: paragraph: + item-78 at level 1: paragraph: + item-79 at level 1: section: group textbox + item-80 at level 2: paragraph: Case closed. + item-81 at level 2: paragraph: + item-82 at level 2: paragraph: + item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-84 at level 1: paragraph: + item-85 at level 1: section: group textbox + item-86 at level 1: paragraph: + item-87 at level 1: paragraph: + item-88 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json new file mode 100644 index 00000000..c7985b24 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -0,0 +1,1470 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "textbox", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 830302052279341882, + "filename": "textbox.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/groups/3" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/groups/9" + }, + { + "$ref": "#/texts/49" + }, + { + "$ref": "#/texts/50" + }, + { + "$ref": "#/texts/51" + }, + { + "$ref": "#/groups/11" + }, + { + "$ref": "#/texts/55" + }, + { + "$ref": "#/groups/12" + }, + { + "$ref": "#/texts/58" + }, + { + "$ref": "#/texts/59" + }, + { + "$ref": "#/groups/13" + }, + { + "$ref": "#/texts/60" + }, + { + "$ref": "#/groups/14" + }, + { + "$ref": "#/texts/61" + }, + { + "$ref": "#/texts/62" + }, + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/texts/67" + }, + { + "$ref": "#/groups/16" + }, + { + "$ref": "#/texts/68" + }, + { + "$ref": "#/texts/69" + }, + { + "$ref": "#/texts/70" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + 
{ + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/15" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/groups/5" + }, + "children": [ + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/31" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/34" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/36" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/10" + }, + { + "$ref": "#/texts/47" + }, + { + "$ref": "#/texts/48" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/52" + }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/texts/57" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/15", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/63" + }, + { + "$ref": "#/texts/64" + }, + { + "$ref": "#/texts/65" + }, + { + "$ref": "#/texts/66" + } + ], + "content_layer": "body", + "name": "textbox", + "label": "section" + }, + { + "self_ref": "#/groups/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": 
"body", + "name": "textbox", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten", + "text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten", + "text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Student falls ill", + "text": "Student falls ill", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Suggested Reportable Symptoms:\n* Fever\n* Cough\n* Diarrhea\n* Vomiting\n* Rash\n* Blisters\n* Headache\n* Sore throat", + "text": "Suggested Reportable Symptoms:\n* Fever\n* Cough\n* Diarrhea\n* Vomiting\n* Rash\n* Blisters\n* Headache\n* Sore throat", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms", + "text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three 
students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": " A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.", + "text": " A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": " A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.", + "text": " A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/body" + }, + 
"children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Health Bureau:", + "text": "Health Bureau:", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.", + "text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.", + "text": "If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.", + "text": "Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": 
"#/texts/35", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "The Health Bureau will handle", + "text": "The Health Bureau will handle", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "reporting and specimen collection", + "text": "reporting and specimen collection", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": ".", + "text": ".", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/48", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + 
"orig": "", + "text": "" + }, + { + "self_ref": "#/texts/49", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/50", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/51", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/52", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Whether the epidemic has eased.", + "text": "Whether the epidemic has eased.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Whether the test results are positive for a legally designated infectious disease.", + "text": "Whether the test results are positive for a legally designated infectious disease.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/58", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/59", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/60", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/61", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/62", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Case closed.", + "text": "Case closed.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/64", + "parent": 
{ + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", + "text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/67", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/68", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/69", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/70", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md new file mode 100644 index 00000000..829abad9 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -0,0 +1,46 @@ +**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten** + +**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten** + +**Student falls ill** + +- Suggested Reportable Symptoms: +* Fever +* Cough +* Diarrhea +* Vomiting +* Rash +* Blisters +* Headache +* Sore throat + +If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students) +show the same suggested reportable symptoms + +Yes + + A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network. + + A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System. + +**Health Bureau:** + +Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control. + +- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection. +- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act. 
+ +Department of Education: +Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels. + +The Health Bureau will handle **reporting and specimen collection** . + +**Whether the epidemic has eased.** + +**Whether the test results are positive for a legally designated infectious disease.** + +No + +**Case closed.** + +The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. \ No newline at end of file diff --git a/tests/data/html/example_8.html b/tests/data/html/example_08.html similarity index 100% rename from tests/data/html/example_8.html rename to tests/data/html/example_08.html diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index d929ae19..f7b5d309 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions(): print(f"converting {csv_path}") gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name - - conv_result: ConversionResult = converter.convert(csv_path) + if csv_path.stem in ( + "csv-too-few-columns", + "csv-too-many-columns", + "csv-inconsistent-header", + ): + with warns(UserWarning, match="Inconsistent column lengths"): + conv_result: ConversionResult = converter.convert(csv_path) + else: + conv_result: ConversionResult = converter.convert(csv_path) doc: DoclingDocument = conv_result.document diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 5dc2e89a..93655527 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -1,9 +1,10 @@ from pathlib import Path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.accelerator_options import AcceleratorDevice from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from .test_data_gen_flag import GEN_TEST_DATA diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index af6e62ce..b34824a1 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -3,10 +3,10 @@ from pathlib import Path from typing import List, Tuple from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.accelerator_options import AcceleratorDevice from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, EasyOcrOptions, OcrMacOptions, OcrOptions, diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 94a68873..d5e40f0f 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -132,6 +132,13 @@ def test_guess_format(tmp_path): doc_path = Path("./tests/data/html/wiki_duck.html") assert dci._guess_format(doc_path) == InputFormat.HTML + html_str = ( # HTML starting with a script + "<script></script>" + '<html><head></head><body></body></html>\n' + ) + stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode())) + assert dci._guess_format(stream) == InputFormat.HTML + # Valid MD buf =
BytesIO(Path("./tests/data/md/wiki.md").open("rb").read()) stream = DocumentStream(name="wiki.md", stream=buf) diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index caef8ffc..73c73c5b 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -38,17 +38,15 @@ def get_converter(): def test_compare_legacy_output(test_doc_paths): converter = get_converter() - res = converter.convert_all(test_doc_paths, raises_on_error=True) - for conv_res in res: print(f"Results for {conv_res.input.file}") - print( - json.dumps( - conv_res.legacy_document.model_dump( - mode="json", by_alias=True, exclude_none=True + with pytest.warns(DeprecationWarning, match="Use document instead"): + print( + json.dumps( + conv_res.legacy_document.model_dump( + mode="json", by_alias=True, exclude_none=True + ) ) ) - ) - # assert res.legacy_output == res.legacy_output_transformed diff --git a/tests/test_options.py b/tests/test_options.py index 7addda4b..a0835d1c 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, PdfPipelineOptions, TableFormerMode, ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 46a46ace..96cf9c3c 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -4,6 +4,7 @@ import warnings from pathlib import Path from typing import List, Optional +import pytest from docling_core.types.doc import ( DocItem, DoclingDocument, @@ -302,9 +303,8 @@ def verify_conversion_result_v1( ) doc_pred_pages: List[Page] = doc_result.pages - doc_pred: DsDocument = doc_result.legacy_document - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + with pytest.warns(DeprecationWarning, match="Use document instead"): + doc_pred: DsDocument = doc_result.legacy_document doc_pred_md = doc_result.legacy_document.export_to_markdown() doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() @@ -323,33 +323,33 @@ def verify_conversion_result_v1( if generate: # only used when re-generating truth pages_path.parent.mkdir(parents=True, exist_ok=True) - with open(pages_path, "w") as fw: + with open(pages_path, mode="w", encoding="utf-8") as fw: fw.write( json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent) ) json_path.parent.mkdir(parents=True, exist_ok=True) - with open(json_path, "w") as fw: + with open(json_path, mode="w", encoding="utf-8") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent)) md_path.parent.mkdir(parents=True, exist_ok=True) - with open(md_path, "w") as fw: + with open(md_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_md) dt_path.parent.mkdir(parents=True, exist_ok=True) - with open(dt_path, "w") as fw: + with open(dt_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_dt) else: # default branch in test - with open(pages_path) as fr: + with open(pages_path, 
encoding="utf-8") as fr: doc_true_pages = PageList.validate_json(fr.read()) - with open(json_path) as fr: + with open(json_path, encoding="utf-8") as fr: doc_true: DsDocument = DsDocument.model_validate_json(fr.read()) - with open(md_path) as fr: + with open(md_path, encoding="utf-8") as fr: doc_true_md = fr.read() - with open(dt_path) as fr: + with open(dt_path, encoding="utf-8") as fr: doc_true_dt = fr.read() if not fuzzy: @@ -391,7 +391,7 @@ def verify_conversion_result_v2( doc_pred_pages: List[Page] = doc_result.pages doc_pred: DoclingDocument = doc_result.document doc_pred_md = doc_result.document.export_to_markdown() - doc_pred_dt = doc_result.document.export_to_document_tokens() + doc_pred_dt = doc_result.document.export_to_doctags() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" @@ -408,33 +408,33 @@ def verify_conversion_result_v2( if generate: # only used when re-generating truth pages_path.parent.mkdir(parents=True, exist_ok=True) - with open(pages_path, "w") as fw: + with open(pages_path, mode="w", encoding="utf-8") as fw: fw.write( json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent) ) json_path.parent.mkdir(parents=True, exist_ok=True) - with open(json_path, "w") as fw: + with open(json_path, mode="w", encoding="utf-8") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent)) md_path.parent.mkdir(parents=True, exist_ok=True) - with open(md_path, "w") as fw: + with open(md_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_md) dt_path.parent.mkdir(parents=True, exist_ok=True) - with open(dt_path, "w") as fw: + with open(dt_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_dt) else: # default branch in test - with open(pages_path) as fr: + with open(pages_path, encoding="utf-8") as fr: doc_true_pages = PageList.validate_json(fr.read()) - with open(json_path) as fr: + with open(json_path, encoding="utf-8") as fr: doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read()) - with open(md_path) as fr: + with open(md_path, encoding="utf-8") as fr: doc_true_md = fr.read() - with open(dt_path) as fr: + with open(dt_path, encoding="utf-8") as fr: doc_true_dt = fr.read() if not fuzzy: @@ -461,12 +461,12 @@ def verify_conversion_result_v2( def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False): if not os.path.exists(gtfile) or generate: - with open(gtfile, "w") as fw: + with open(gtfile, mode="w", encoding="utf-8") as fw: json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2) return True else: - with open(gtfile) as fr: + with open(gtfile, encoding="utf-8") as fr: true_doc = DoclingDocument.model_validate_json(fr.read()) return verify_docitems(pred_doc, true_doc, fuzzy=False) @@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool: file = Path(gtfile) if not file.exists() or generate: - with file.open("w") as fw: + with file.open(mode="w", encoding="utf-8") as fw: fw.write(pred_text) return True - with file.open("r") as fr: + with file.open(encoding="utf-8") as fr: true_text = fr.read() return pred_text == true_text diff --git a/uv.lock b/uv.lock index 498c2379..d986422e 100644 --- a/uv.lock +++ b/uv.lock @@ -775,7 +775,7 @@ wheels = [ [[package]] name = "docling" -version = "2.34.0" +version = "2.35.0" source = { editable = "." } dependencies = [ { name = "beautifulsoup4" },