From 1d680b0a321d95fc6bd65b7bb4d5e15005a0250a Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:05:57 +0100 Subject: [PATCH 01/10] docs: Linux Foundation AI & Data (#1183) * point the auxiliary files to the community repo and add lfai in README Signed-off-by: Michele Dolfi * update docs index Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- CODE_OF_CONDUCT.md | 128 +-------------------------------------------- CONTRIBUTING.md | 80 +--------------------------- MAINTAINERS.md | 3 -- README.md | 9 +++- docs/index.md | 11 ++-- 5 files changed, 17 insertions(+), 214 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index c7d5ea1c..2c90084e 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,129 +1,3 @@ # Contributor Covenant Code of Conduct -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, religion, or sexual identity -and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the - overall community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or - advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email - address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, -or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement using -[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). 
- -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series -of actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. Violating these terms may lead to a temporary or -permanent ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within -the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.0, available at -[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html). - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org) - -For answers to common questions about this code of conduct, see the FAQ at -[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at -[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations). \ No newline at end of file +This project adheres to the [Docling - Code of Conduct and Covenant](https://github.com/docling-project/community/blob/main/CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 843c6daf..5146de13 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,85 +2,7 @@ Our project welcomes external contributions. If you have an itch, please feel free to scratch it. -To contribute code or documentation, please submit a [pull request](https://github.com/docling-project/docling/pulls). 
- -A good way to familiarize yourself with the codebase and contribution process is -to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/docling-project/docling/issues). -Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us. - -For general questions or support requests, please refer to the [discussion section](https://github.com/docling-project/docling/discussions). - -**Note: We appreciate your effort and want to avoid situations where a contribution -requires extensive rework (by you or by us), sits in the backlog for a long time, or -cannot be accepted at all!** - -### Proposing New Features - -If you would like to implement a new feature, please [raise an issue](https://github.com/docling-project/docling/issues) -before sending a pull request so the feature can be discussed. This is to avoid -you spending valuable time working on a feature that the project developers -are not interested in accepting into the codebase. - -### Fixing Bugs - -If you would like to fix a bug, please [raise an issue](https://github.com/docling-project/docling/issues) before sending a -pull request so it can be tracked. - -### Merge Approval - -The project maintainers use LGTM (Looks Good To Me) in comments on the code -review to indicate acceptance. A change requires LGTMs from two of the -maintainers of each component affected. - -For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page. - - -## Legal - -Each source file must include a license header for the MIT -Software. Using the SPDX format is the simplest approach, -e.g. - -``` -/* -Copyright IBM Inc. All rights reserved. - -SPDX-License-Identifier: MIT -*/ -``` - -We have tried to make it as easy as possible to make contributions. This -applies to how we handle the legal aspects of contribution. We use the -same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin) -uses to manage code contributions. - -We simply ask that when submitting a patch for review, the developer -must include a sign-off statement in the commit message. - -Here is an example Signed-off-by line, which indicates that the -submitter accepts the DCO: - -``` -Signed-off-by: John Doe -``` - -You can include this automatically when you commit a change to your -local git repository using the following command: - -``` -git commit -s -``` - -### New dependencies - -This project strictly adheres to using dependencies that are compatible with the MIT license to ensure maximum flexibility and permissiveness in its usage and distribution. As a result, dependencies licensed under restrictive terms such as GPL, LGPL, AGPL, or similar are explicitly excluded. These licenses impose additional requirements and limitations that are incompatible with the MIT license's minimal restrictions, potentially affecting derivative works and redistribution. By maintaining this policy, the project ensures simplicity and freedom for both developers and users, avoiding conflicts with stricter copyleft provisions. - - -## Communication - -Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions). - - +For more details on the contributing guidelines head to the Docling Project [community repository](https://github.com/docling-project/community). 
## Developing diff --git a/MAINTAINERS.md b/MAINTAINERS.md index a2b8a4ff..69b32aad 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -2,9 +2,6 @@ - Christoph Auer - [@cau-git](https://github.com/cau-git) - Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm) -- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic) -- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos) -- Ahmed Nassar - [@nassarofficial](https://github.com/nassarofficial) - Panos Vagenas - [@vagenas](https://github.com/vagenas) - Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM) diff --git a/README.md b/README.md index e5168347..208de0d0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) +[![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. @@ -120,9 +121,13 @@ If you use Docling in your projects, please consider citing the following: The Docling codebase is under MIT license. For individual model usage, please refer to the model licenses found in the original packages. -## IBM ❤️ Open Source AI +## LF AI & Data -Docling has been brought to you by IBM. +Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/). + +### IBM ❤️ Open Source AI + +The project was started by the AI for knowledge team at IBM Research Zurich. [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/ diff --git a/docs/index.md b/docs/index.md index 9d291b1b..789dae8f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,6 +13,7 @@ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) +[![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. @@ -25,12 +26,12 @@ Docling simplifies document processing, parsing diverse formats — including ad * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. 
LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images +* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) * 💻 Simple and convenient CLI ### Coming soon * 📝 Metadata extraction, including title, authors, references & language -* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Complex chemistry understanding (Molecular structures) @@ -43,9 +44,13 @@ Docling simplifies document processing, parsing diverse formats — including ad Reference
See more API details
-## IBM ❤️ Open Source AI +## LF AI & Data -Docling has been brought to you by IBM. +Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/). + +### IBM ❤️ Open Source AI + +The project was started by the AI for knowledge team at IBM Research Zurich. [supported_formats]: ./usage/supported_formats.md [docling_document]: ./concepts/docling_document.md From 0b707d0882f5be42505871799387d0b1882bffbf Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Wed, 19 Mar 2025 10:34:25 +0100 Subject: [PATCH 02/10] fix(msword): Fixing function return in equations handling (#1194) * Fixing function return Signed-off-by: Rafael Teixeira de Lima * Add message Signed-off-by: Rafael Teixeira de Lima --------- Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 926ce083..78fe7df8 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -275,8 +275,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): only_equations.append(latex_equation) texts_and_equations.append(latex_equation) - if "".join(only_texts) != text: - return text + if "".join(only_texts).strip() != text.strip(): + # If we are not able to reconstruct the initial raw text + # do not try to parse equations and return the original + return text, [] return "".join(texts_and_equations), only_equations @@ -365,6 +367,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for eq in equations: if len(text_tmp) == 0: break + pre_eq_text = text_tmp.split(eq, maxsplit=1)[0] text_tmp = text_tmp.split(eq, maxsplit=1)[1] if len(pre_eq_text) > 0: From d5f7798763be56b9b490c91379acec27f1f2a33f Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 19 Mar 2025 11:03:46 +0100 Subject: [PATCH 03/10] test(html): fix regression test after docling-core update (#1197) Update docling-core dependency to version 2.23.3. Fix regression test of HTML backend after docling-core dependency update. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- poetry.lock | 8 ++++---- tests/data/groundtruth/docling_v2/example_03.html.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 45b53c64..0f20f5e2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "accelerate" @@ -870,13 +870,13 @@ files = [ [[package]] name = "docling-core" -version = "2.23.1" +version = "2.23.3" description = "A python library to define and validate data types in Docling." 
optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.23.1-py3-none-any.whl", hash = "sha256:4a3f7bcc55a735a070d69d74cf1278f7e40cb403c5059d4149672c7ca163992f"}, - {file = "docling_core-2.23.1.tar.gz", hash = "sha256:0708f4ffe61faef9a2dee48e71cf3890248bf1d9b409f6414cd9c0dd6c7a1681"}, + {file = "docling_core-2.23.3-py3-none-any.whl", hash = "sha256:a2166ffc41f8fdf6fdb99b33da6c7146eccf6382712ea92e95772604fb5af5e5"}, + {file = "docling_core-2.23.3.tar.gz", hash = "sha256:a64ce41e0881c06962a2b3ec80e0665f84de0809dedf1bf84f3a14b75dd665c4"}, ] [package.dependencies] diff --git a/tests/data/groundtruth/docling_v2/example_03.html.md b/tests/data/groundtruth/docling_v2/example_03.html.md index 7e4f5611..f969a146 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.md +++ b/tests/data/groundtruth/docling_v2/example_03.html.md @@ -16,7 +16,7 @@ Some background information here. 1. First item in ordered list 1. Nested ordered item 1 2. Nested ordered item 2 -3. Second item in ordered list +2. Second item in ordered list ### Data Table From f5adfb9724aae1207f23e21d74033f331e6e1ffb Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Wed, 19 Mar 2025 11:05:42 +0100 Subject: [PATCH 04/10] fix: Determine correct page size in DoclingParseV4Backend (#1196) Signed-off-by: Christoph Auer --- docling/backend/docling_parse_v4_backend.py | 33 +++++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index 9ec0aee2..e1e74301 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend): padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t - image = ( - self._ppage.render( - scale=scale * 1.5, - rotation=0, # no additional rotation - crop=padbox.as_tuple(), - ) - .to_pil() - .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) - ) # We resize the image from 1.5x the given scale to make it sharper. + with pypdfium2_lock: + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize( + size=(round(cropbox.width * scale), round(cropbox.height * scale)) + ) + ) # We resize the image from 1.5x the given scale to make it sharper. return image def get_size(self) -> Size: - return Size( - width=self._dpage.dimension.width, - height=self._dpage.dimension.height, - ) + with pypdfium2_lock: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) + + # TODO: Take width and height from docling-parse. 
+ # return Size( + # width=self._dpage.dimension.width, + # height=self._dpage.dimension.height, + # ) def unload(self): self._ppage = None From e7c29a89d098449762698d903f81860e30787d13 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 19 Mar 2025 10:51:20 +0100 Subject: [PATCH 05/10] Initial implementation to support MLX for VLM pipeline and SmolDocling Signed-off-by: Maksym Lysak --- docling/datamodel/pipeline_options.py | 16 ++++++++++++ docling/pipeline/vlm_pipeline.py | 36 ++++++++++++++++++++------- docs/examples/minimal_vlm_pipeline.py | 6 ++++- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index d28b5826..c96603c4 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -263,6 +263,11 @@ class ResponseFormat(str, Enum): MARKDOWN = "markdown" +class InferenceFramework(str, Enum): + MLX = "mlx" + TRANSFORMERS = "transformers" + + class HuggingFaceVlmOptions(BaseVlmOptions): kind: Literal["hf_model_options"] = "hf_model_options" @@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions): llm_int8_threshold: float = 6.0 quantized: bool = False + inference_framework: InferenceFramework response_format: ResponseFormat @property @@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions): return self.repo_id.replace("/", "--") +smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions( + repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16", + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, + inference_framework=InferenceFramework.MLX, +) + + smoldocling_vlm_conversion_options = HuggingFaceVlmOptions( repo_id="ds4sd/SmolDocling-256M-preview", prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, + inference_framework=InferenceFramework.TRANSFORMERS, ) granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( @@ -289,6 +304,7 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( # prompt="OCR the full page to markdown.", prompt="OCR this image.", response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, ) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 4afb918d..d01ee7f1 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions +from docling.datamodel.pipeline_options import ( + InferenceFramework, + ResponseFormat, + VlmPipelineOptions, +) from docling.datamodel.settings import settings +from docling.models.hf_mlx_model import HuggingFaceMlxModel from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder @@ -58,14 +63,27 @@ class VlmPipeline(PaginatedPipeline): self.keep_images = self.pipeline_options.generate_page_images - self.build_pipe = [ - HuggingFaceVlmModel( - enabled=True, # must be always enabled for this pipeline to make sense. 
- artifacts_path=artifacts_path, - accelerator_options=pipeline_options.accelerator_options, - vlm_options=self.pipeline_options.vlm_options, - ), - ] + if ( + self.pipeline_options.vlm_options.inference_framework + == InferenceFramework.MLX + ): + self.build_pipe = [ + HuggingFaceMlxModel( + enabled=True, # must be always enabled for this pipeline to make sense. + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=self.pipeline_options.vlm_options, + ), + ] + else: + self.build_pipe = [ + HuggingFaceVlmModel( + enabled=True, # must be always enabled for this pipeline to make sense. + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=self.pipeline_options.vlm_options, + ), + ] self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py index 948ecc64..c10b000f 100644 --- a/docs/examples/minimal_vlm_pipeline.py +++ b/docs/examples/minimal_vlm_pipeline.py @@ -10,6 +10,7 @@ from docling.datamodel.pipeline_options import ( VlmPipelineOptions, granite_vision_vlm_conversion_options, smoldocling_vlm_conversion_options, + smoldocling_vlm_mlx_conversion_options, ) from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, PdfFormatOption @@ -29,7 +30,10 @@ pipeline_options.force_backend_text = False # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True ## Pick a VLM model. We choose SmolDocling-256M by default -pipeline_options.vlm_options = smoldocling_vlm_conversion_options +# pipeline_options.vlm_options = smoldocling_vlm_conversion_options + +## Pick a VLM model. 
Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX +pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options ## Alternative VLM models: # pipeline_options.vlm_options = granite_vision_vlm_conversion_options From bd0c4dfe10849e723fab7220b0fcbebd351c0277 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 19 Mar 2025 11:02:26 +0100 Subject: [PATCH 06/10] mlx_model unit Signed-off-by: Maksym Lysak --- docling/models/hf_mlx_model.py | 132 +++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 docling/models/hf_mlx_model.py diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py new file mode 100644 index 00000000..1c4577ac --- /dev/null +++ b/docling/models/hf_mlx_model.py @@ -0,0 +1,132 @@ +import logging +import time +from pathlib import Path +from typing import Iterable, List, Optional + +from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + HuggingFaceVlmOptions, +) +from docling.datamodel.settings import settings +from docling.models.base_model import BasePageModel +from docling.utils.accelerator_utils import decide_device +from docling.utils.profiling import TimeRecorder + +_log = logging.getLogger(__name__) + + +class HuggingFaceMlxModel(BasePageModel): + + def __init__( + self, + enabled: bool, + artifacts_path: Optional[Path], + accelerator_options: AcceleratorOptions, + vlm_options: HuggingFaceVlmOptions, + ): + self.enabled = enabled + + self.vlm_options = vlm_options + + if self.enabled: + from mlx_vlm import generate, load # type: ignore + from mlx_vlm.prompt_utils import apply_chat_template # type: ignore + from mlx_vlm.utils import load_config, stream_generate # type: ignore + + repo_cache_folder = vlm_options.repo_id.replace("/", "--") + self.apply_chat_template = apply_chat_template + self.stream_generate = stream_generate + + # PARAMETERS: + if artifacts_path is None: + artifacts_path = self.download_models(self.vlm_options.repo_id) + elif (artifacts_path / repo_cache_folder).exists(): + artifacts_path = artifacts_path / repo_cache_folder + + self.param_question = vlm_options.prompt # "Perform Layout Analysis." 
+ + ## Load the model + self.vlm_model, self.processor = load(artifacts_path) + self.config = load_config(artifacts_path) + + @staticmethod + def download_models( + repo_id: str, + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + repo_id=repo_id, + force_download=force, + local_dir=local_dir, + # revision="v0.0.1", + ) + + return Path(download_path) + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "vlm"): + assert page.size is not None + + hi_res_image = page.get_image(scale=2.0) # 144dpi + # hi_res_image = page.get_image(scale=1.0) # 72dpi + + if hi_res_image is not None: + im_width, im_height = hi_res_image.size + + # populate page_tags with predicted doc tags + page_tags = "" + + if hi_res_image: + if hi_res_image.mode != "RGB": + hi_res_image = hi_res_image.convert("RGB") + + prompt = self.apply_chat_template( + self.processor, self.config, self.param_question, num_images=1 + ) + + start_time = time.time() + # Call model to generate: + output = "" + for token in self.stream_generate( + self.vlm_model, + self.processor, + prompt, + [hi_res_image], + max_tokens=4096, + verbose=False, + ): + output += token.text + print(token.text, end="") + if "" in token.text: + break + + generation_time = time.time() - start_time + page_tags = output + + # inference_time = time.time() - start_time + # tokens_per_second = num_tokens / generation_time + # print("") + # print(f"Page Inference Time: {inference_time:.2f} seconds") + # print(f"Total tokens on page: {num_tokens:.2f}") + # print(f"Tokens/sec: {tokens_per_second:.2f}") + # print("") + page.predictions.vlm_response = VlmPrediction(text=page_tags) + + yield page From 0875388ce6a82e25c95a75a74fa09275abf73672 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 19 Mar 2025 13:18:04 +0100 Subject: [PATCH 07/10] Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer --- docling/cli/main.py | 119 ++++++++++++++++++-------- docling/datamodel/pipeline_options.py | 14 ++- docling/models/hf_mlx_model.py | 13 ++- docling/pipeline/vlm_pipeline.py | 6 -- pyproject.toml | 1 + 5 files changed, 103 insertions(+), 50 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 7f0f20bf..c85a04f3 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import ( AcceleratorOptions, EasyOcrOptions, OcrOptions, + PaginatedPipelineOptions, PdfBackend, + PdfPipeline, PdfPipelineOptions, TableFormerMode, + VlmModelType, + VlmPipelineOptions, + granite_vision_vlm_conversion_options, + smoldocling_vlm_conversion_options, + smoldocling_vlm_mlx_conversion_options, ) from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.models.factories import get_ocr_factory +from docling.pipeline.vlm_pipeline import VlmPipeline warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") @@ -200,6 +208,14 @@ def convert( help="Image export mode for the 
document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.", ), ] = ImageRefMode.EMBEDDED, + pipeline: Annotated[ + PdfPipeline, + typer.Option(..., help="Choose the pipeline to process PDF or image files."), + ] = PdfPipeline.STANDARD, + vlm_model: Annotated[ + VlmModelType, + typer.Option(..., help="Choose the VLM model to use with PDF or image files."), + ] = VlmModelType.SMOLDOCLING, ocr: Annotated[ bool, typer.Option( @@ -420,50 +436,77 @@ def convert( ocr_options.lang = ocr_lang_list accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) - pipeline_options = PdfPipelineOptions( - allow_external_plugins=allow_external_plugins, - enable_remote_services=enable_remote_services, - accelerator_options=accelerator_options, - do_ocr=ocr, - ocr_options=ocr_options, - do_table_structure=True, - do_code_enrichment=enrich_code, - do_formula_enrichment=enrich_formula, - do_picture_description=enrich_picture_description, - do_picture_classification=enrich_picture_classes, - document_timeout=document_timeout, - ) - pipeline_options.table_structure_options.do_cell_matching = ( - True # do_cell_matching - ) - pipeline_options.table_structure_options.mode = table_mode + pipeline_options: PaginatedPipelineOptions - if image_export_mode != ImageRefMode.PLACEHOLDER: - pipeline_options.generate_page_images = True - pipeline_options.generate_picture_images = ( - True # FIXME: to be deprecated in verson 3 + if pipeline == PdfPipeline.STANDARD: + pipeline_options = PdfPipelineOptions( + allow_external_plugins=allow_external_plugins, + enable_remote_services=enable_remote_services, + accelerator_options=accelerator_options, + do_ocr=ocr, + ocr_options=ocr_options, + do_table_structure=True, + do_code_enrichment=enrich_code, + do_formula_enrichment=enrich_formula, + do_picture_description=enrich_picture_description, + do_picture_classification=enrich_picture_classes, + document_timeout=document_timeout, + ) + pipeline_options.table_structure_options.do_cell_matching = ( + True # do_cell_matching + ) + pipeline_options.table_structure_options.mode = table_mode + + if image_export_mode != ImageRefMode.PLACEHOLDER: + pipeline_options.generate_page_images = True + pipeline_options.generate_picture_images = ( + True # FIXME: to be deprecated in verson 3 + ) + pipeline_options.images_scale = 2 + + backend: Type[PdfDocumentBackend] + if pdf_backend == PdfBackend.DLPARSE_V1: + backend = DoclingParseDocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V2: + backend = DoclingParseV2DocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V4: + backend = DoclingParseV4DocumentBackend # type: ignore + elif pdf_backend == PdfBackend.PYPDFIUM2: + backend = PyPdfiumDocumentBackend # type: ignore + else: + raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") + + pdf_format_option = PdfFormatOption( + pipeline_options=pipeline_options, + backend=backend, # pdf_backend + ) + elif pipeline == PdfPipeline.VLM: + pipeline_options = VlmPipelineOptions() + + if vlm_model == VlmModelType.GRANITE_VISION: + pipeline_options.vlm_options = granite_vision_vlm_conversion_options + elif vlm_model == VlmModelType.SMOLDOCLING: + pipeline_options.vlm_options = smoldocling_vlm_conversion_options + if sys.platform == "darwin": + try: + import mlx_vlm + + 
pipeline_options.vlm_options = ( + smoldocling_vlm_mlx_conversion_options + ) + except ImportError: + _log.warning( + "To run SmolDocling faster, please install mlx-vlm:\n" + "pip install mlx-vlm" + ) + + pdf_format_option = PdfFormatOption( + pipeline_cls=VlmPipeline, pipeline_options=pipeline_options ) - pipeline_options.images_scale = 2 if artifacts_path is not None: pipeline_options.artifacts_path = artifacts_path - backend: Type[PdfDocumentBackend] - if pdf_backend == PdfBackend.DLPARSE_V1: - backend = DoclingParseDocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V2: - backend = DoclingParseV2DocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V4: - backend = DoclingParseV4DocumentBackend # type: ignore - elif pdf_backend == PdfBackend.PYPDFIUM2: - backend = PyPdfiumDocumentBackend # type: ignore - else: - raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") - - pdf_format_option = PdfFormatOption( - pipeline_options=pipeline_options, - backend=backend, # pdf_backend - ) format_options: Dict[InputFormat, FormatOption] = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index c96603c4..654e04df 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -308,6 +308,11 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( ) +class VlmModelType(str, Enum): + SMOLDOCLING = "smoldocling" + GRANITE_VISION = "granite_vision" + + # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -343,13 +348,14 @@ class PipelineOptions(BaseModel): class PaginatedPipelineOptions(PipelineOptions): + artifacts_path: Optional[Union[Path, str]] = None + images_scale: float = 1.0 generate_page_images: bool = False generate_picture_images: bool = False class VlmPipelineOptions(PaginatedPipelineOptions): - artifacts_path: Optional[Union[Path, str]] = None generate_page_images: bool = True force_backend_text: bool = ( @@ -362,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions): class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" - artifacts_path: Optional[Union[Path, str]] = None do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_code_enrichment: bool = False # True: perform code OCR @@ -393,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions): ) generate_parsed_pages: bool = False + + +class PdfPipeline(str, Enum): + STANDARD = "standard" + VLM = "vlm" diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py index 1c4577ac..762a6557 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/hf_mlx_model.py @@ -32,9 +32,15 @@ class HuggingFaceMlxModel(BasePageModel): self.vlm_options = vlm_options if self.enabled: - from mlx_vlm import generate, load # type: ignore - from mlx_vlm.prompt_utils import apply_chat_template # type: ignore - from mlx_vlm.utils import load_config, stream_generate # type: ignore + + try: + from mlx_vlm import generate, load # type: ignore + from mlx_vlm.prompt_utils import apply_chat_template # type: ignore + from mlx_vlm.utils import load_config, stream_generate # type: ignore + except ImportError: + raise ImportError( + "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models." 
+ ) repo_cache_folder = vlm_options.repo_id.replace("/", "--") self.apply_chat_template = apply_chat_template @@ -113,7 +119,6 @@ class HuggingFaceMlxModel(BasePageModel): verbose=False, ): output += token.text - print(token.text, end="") if "" in token.text: break diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index d01ee7f1..5237f2ae 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -34,12 +34,6 @@ class VlmPipeline(PaginatedPipeline): super().__init__(pipeline_options) self.keep_backend = True - warnings.warn( - "The VlmPipeline is currently experimental and may change in upcoming versions without notice.", - category=UserWarning, - stacklevel=2, - ) - self.pipeline_options: VlmPipelineOptions artifacts_path: Optional[Path] = None diff --git a/pyproject.toml b/pyproject.toml index 0f85915f..8d121d36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -192,6 +192,7 @@ module = [ "docling_ibm_models.*", "easyocr.*", "ocrmac.*", + "mlx_vlm.*", "lxml.*", "huggingface_hub.*", "transformers.*", From 9182d8a6225b67b7993fa86c2c0680e652aa9c22 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 19 Mar 2025 13:48:44 +0100 Subject: [PATCH 08/10] Updated minimal vlm pipeline example Signed-off-by: Maksym Lysak --- docs/examples/minimal_vlm_pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py index c10b000f..023cbc30 100644 --- a/docs/examples/minimal_vlm_pipeline.py +++ b/docs/examples/minimal_vlm_pipeline.py @@ -17,7 +17,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline sources = [ - "tests/data/2305.03393v1-pg9-img.png", + # "tests/data/2305.03393v1-pg9-img.png", + "tests/data/pdf/2305.03393v1-pg9.pdf", ] ## Use experimental VlmPipeline @@ -73,10 +74,12 @@ for source in sources: print("") print(res.document.export_to_markdown()) + doctags = "" for page in res.pages: print("") print("Predicted page in DOCTAGS:") print(page.predictions.vlm_response.text) + doctags += page.predictions.vlm_response.text res.document.save_as_html( filename=Path("{}/{}.html".format(out_path, res.input.file.stem)), @@ -87,6 +90,12 @@ for source in sources: with (out_path / f"{res.input.file.stem}.json").open("w") as fp: fp.write(json.dumps(res.document.export_to_dict())) + with (out_path / f"{res.input.file.stem}.md").open("w") as fp: + fp.write(res.document.export_to_markdown()) + + with (out_path / f"{res.input.file.stem}.doctag").open("w") as fp: + fp.write(doctags) + pg_num = res.document.num_pages() print("") From a9cf823187941485a60ad1f95c387534e38f6e99 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 19 Mar 2025 13:56:57 +0100 Subject: [PATCH 09/10] make vlm_pipeline python3.9 compatible Signed-off-by: Maksym Lysak --- docling/pipeline/vlm_pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 5237f2ae..9e4adab0 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -91,7 +91,9 @@ class VlmPipeline(PaginatedPipeline): return page - def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str: + def extract_text_from_backend( + self, page: Page, bbox: Union[BoundingBox | None] + ) -> str: # Convert bounding box normalized to 0-100 into page coordinates for cropping text = "" if bbox: From 
16664f2cd6cc1fbc92529aabdce44c0b2ae406a5 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 19 Mar 2025 14:13:05 +0100 Subject: [PATCH 10/10] Fixed extract_text_from_backend definition Signed-off-by: Maksym Lysak --- docling/pipeline/vlm_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 9e4adab0..d4defa89 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -92,7 +92,7 @@ class VlmPipeline(PaginatedPipeline): return page def extract_text_from_backend( - self, page: Page, bbox: Union[BoundingBox | None] + self, page: Page, bbox: Union[BoundingBox, None] ) -> str: # Convert bounding box normalized to 0-100 into page coordinates for cropping text = ""
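
For reference, a minimal usage sketch of the pipeline selection introduced in this series. This is not taken verbatim from the patches above; it assumes the series is applied and, for the MLX path, that `mlx-vlm` is installed on Apple Silicon.

```python
# Hedged sketch: wires together the VLM pipeline options added in PATCH 05/07.
# Assumes a docling build with these patches applied.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()
# Select the MLX-accelerated SmolDocling checkpoint (Apple Silicon only);
# use smoldocling_vlm_conversion_options for the transformers-based variant.
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)

result = converter.convert("tests/data/pdf/2305.03393v1-pg9.pdf")
print(result.document.export_to_markdown())
```

The equivalent CLI invocation added in PATCH 07 would be along the lines of `docling --pipeline vlm --vlm-model smoldocling <file>`, assuming Typer's default flag naming for the new `pipeline` and `vlm_model` options.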