From 90b766e2ae1695a759191df37c272efc09be5ee3 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:55:12 +0100 Subject: [PATCH 1/6] fix(markdown): handle nested lists (#910) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/md_backend.py | 108 ++++++++++-------- .../data/groundtruth/docling_v2/nested.md.md | 31 +++++ tests/data/md/nested.md | 66 +++++++++++ tests/test_backend_markdown.py | 12 +- tests/test_data_gen_flag.py | 9 ++ 5 files changed, 177 insertions(+), 49 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/nested.md.md create mode 100644 tests/data/md/nested.md create mode 100644 tests/test_data_gen_flag.py diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index eaf47537..19a21c19 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#" class MarkdownDocumentBackend(DeclarativeDocumentBackend): - def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): + def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): # This regex will match any sequence of underscores pattern = r"_+" @@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # very long sequences of underscores will lead to unnecessary long processing times. # In any proper Markdown files, underscores have to be escaped, # otherwise they represent emphasis (bold or italic) - self.markdown = self.shorten_underscore_sequences(text_stream) + self.markdown = self._shorten_underscore_sequences(text_stream) if isinstance(self.path_or_stream, Path): with open(self.path_or_stream, "r", encoding="utf-8") as f: md_content = f.read() @@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # very long sequences of underscores will lead to unnecessary long processing times. # In any proper Markdown files, underscores have to be escaped, # otherwise they represent emphasis (bold or italic) - self.markdown = self.shorten_underscore_sequences(md_content) + self.markdown = self._shorten_underscore_sequences(md_content) self.valid = True _log.debug(self.markdown) @@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) from e return - def close_table(self, doc: DoclingDocument): + def _close_table(self, doc: DoclingDocument): if self.in_table: _log.debug("=== TABLE START ===") for md_table_row in self.md_table_buffer: @@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=table_data) return - def process_inline_text( - self, parent_element: Optional[NodeItem], doc: DoclingDocument + def _process_inline_text( + self, parent_item: Optional[NodeItem], doc: DoclingDocument ): txt = " ".join(self.inline_texts) if len(txt) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, - parent=parent_element, + parent=parent_item, text=txt, ) self.inline_texts = [] - def iterate_elements( + def _iterate_elements( self, element: marko.element.Element, depth: int, doc: DoclingDocument, - parent_element: Optional[NodeItem] = None, + visited: Set[marko.element.Element], + parent_item: Optional[NodeItem] = None, ): + + if element in visited: + return + # Iterates over all elements in the AST # Check for different element types and process relevant details if isinstance(element, marko.block.Heading) and len(element.children) > 0: - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug( f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) @@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): traverse(element) snippet_text = "".join(strings) if len(snippet_text) > 0: - parent_element = doc.add_text( - label=doc_label, parent=parent_element, text=snippet_text + parent_item = doc.add_text( + label=doc_label, parent=parent_item, text=snippet_text ) elif isinstance(element, marko.block.List): @@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): has_non_empty_list_items = True break - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST - parent_element = doc.add_group( - label=label, name=f"list", parent=parent_element + parent_item = doc.add_group( + label=label, name=f"list", parent=parent_item ) elif isinstance(element, marko.block.ListItem) and len(element.children) > 0: - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(" - List item") - snippet_text = str(element.children[0].children[0].children) # type: ignore + first_child = element.children[0] + snippet_text = str(first_child.children[0].children) # type: ignore is_numbered = False if ( - parent_element is not None - and isinstance(parent_element, DocItem) - and parent_element.label == GroupLabel.ORDERED_LIST + parent_item is not None + and isinstance(parent_item, DocItem) + and parent_item.label == GroupLabel.ORDERED_LIST ): is_numbered = True doc.add_list_item( - enumerated=is_numbered, parent=parent_element, text=snippet_text + enumerated=is_numbered, parent=parent_item, text=snippet_text ) + visited.add(first_child) elif isinstance(element, marko.inline.Image): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") fig_caption: Optional[TextItem] = None @@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.CAPTION, text=element.title ) - doc.add_picture(parent=parent_element, caption=fig_caption) + doc.add_picture(parent=parent_item, caption=fig_caption) elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0: - self.process_inline_text(parent_element, doc) + self._process_inline_text(parent_item, doc) elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") @@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): else: self.md_table_buffer.append(snippet_text) else: - self.close_table(doc) - self.in_table = False + self._close_table(doc) # most likely just inline text self.inline_texts.append(str(element.children)) elif isinstance(element, marko.inline.CodeSpan): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() - doc.add_code(parent=parent_element, text=snippet_text) + doc.add_code(parent=parent_item, text=snippet_text) elif ( isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)) @@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): and isinstance((first_child := element.children[0]), marko.inline.RawText) and len(snippet_text := (first_child.children.strip())) > 0 ): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Code Block: {element.children}") - doc.add_code(parent=parent_element, text=snippet_text) + doc.add_code(parent=parent_item, text=snippet_text) elif isinstance(element, marko.inline.LineBreak): if self.in_table: @@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.HTMLBlock): self._html_blocks += 1 - self.process_inline_text(parent_element, doc) - self.close_table(doc) + self._process_inline_text(parent_item, doc) + self._close_table(doc) _log.debug("HTML Block: {}".format(element)) if ( len(element.body) > 0 @@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # wrap in markers to enable post-processing in convert() text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" - doc.add_code(parent=parent_element, text=text_to_add) + doc.add_code(parent=parent_item, text=text_to_add) else: if not isinstance(element, str): - self.close_table(doc) + self._close_table(doc) _log.debug("Some other element: {}".format(element)) processed_block_types = ( - marko.block.ListItem, marko.block.Heading, marko.block.CodeBlock, marko.block.FencedCode, - # marko.block.Paragraph, marko.inline.RawText, ) @@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): element, processed_block_types ): for child in element.children: - self.iterate_elements(child, depth + 1, doc, parent_element) + self._iterate_elements( + element=child, + depth=depth + 1, + doc=doc, + visited=visited, + parent_item=parent_item, + ) def is_valid(self) -> bool: return self.valid @@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): marko_parser = Markdown() parsed_ast = marko_parser.parse(self.markdown) # Start iterating from the root of the AST - self.iterate_elements(parsed_ast, 0, doc, None) - self.process_inline_text(None, doc) # handle last hanging inline text - self.close_table(doc=doc) # handle any last hanging table + self._iterate_elements( + element=parsed_ast, + depth=0, + doc=doc, + parent_item=None, + visited=set(), + ) + self._process_inline_text(None, doc) # handle last hanging inline text + self._close_table(doc=doc) # handle any last hanging table # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: diff --git a/tests/data/groundtruth/docling_v2/nested.md.md b/tests/data/groundtruth/docling_v2/nested.md.md new file mode 100644 index 00000000..6e430e0c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/nested.md.md @@ -0,0 +1,31 @@ +# Nesting + +A list featuring nesting: + +- abc + - abc123 + - abc1234 + - abc12345 + - a. + - b. + - abcd1234: + - abcd12345: + - a. + - b. +- def: + - def1234: + - def12345。 +- after one empty line + - foo +- afer two empty lines + - bar + +- changing symbol + +A nested HTML list: + +- First item +- Second item with subitems: + - Subitem 1 + - Subitem 2 +- Last list item diff --git a/tests/data/md/nested.md b/tests/data/md/nested.md new file mode 100644 index 00000000..4e203eec --- /dev/null +++ b/tests/data/md/nested.md @@ -0,0 +1,66 @@ +# Nesting + +A list featuring nesting: + +- abc + - abc123 + - abc1234 + - abc12345 + - a. + - b. + - abcd1234: + - abcd12345: + - a. + - b. +- def: + - def1234: + - def12345。 + +- after one empty line + - foo + + +- afer two empty lines + - bar +* changing symbol + +A nested HTML list: + + + + diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py index caa94d9f..5a201ab2 100644 --- a/tests/test_backend_markdown.py +++ b/tests/test_backend_markdown.py @@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument +from .test_data_gen_flag import GEN_TEST_DATA + def test_convert_valid(): fmt = InputFormat.MD @@ -30,6 +32,10 @@ def test_convert_valid(): act_doc = backend.convert() act_data = act_doc.export_to_markdown() - with open(gt_path, "r", encoding="utf-8") as f: - exp_data = f.read().rstrip() - assert act_data == exp_data + if GEN_TEST_DATA: + with open(gt_path, mode="w", encoding="utf-8") as f: + f.write(f"{act_data}\n") + else: + with open(gt_path, encoding="utf-8") as f: + exp_data = f.read().rstrip() + assert exp_data == act_data diff --git a/tests/test_data_gen_flag.py b/tests/test_data_gen_flag.py new file mode 100644 index 00000000..a4baff66 --- /dev/null +++ b/tests/test_data_gen_flag.py @@ -0,0 +1,9 @@ +import os + +from pydantic import TypeAdapter + +GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0)) + + +def test_gen_test_data_flag(): + assert not GEN_TEST_DATA From 02faf5376b22e174a6aa90dc7bd95feb14a94754 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Fri, 7 Feb 2025 13:58:05 +0100 Subject: [PATCH 2/6] refactor: use org--name in artifacts-path (#912) use org--name in artifacts-path Signed-off-by: Michele Dolfi --- docling/models/code_formula_model.py | 2 +- docling/models/document_picture_classifier.py | 2 +- docling/models/layout_model.py | 2 +- docling/models/table_structure_model.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py index 8bb29af2..1a0f0bf0 100644 --- a/docling/models/code_formula_model.py +++ b/docling/models/code_formula_model.py @@ -62,7 +62,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): Processes the given batch of elements and enriches them with predictions. """ - _model_repo_folder = "CodeFormula" + _model_repo_folder = "ds4sd--CodeFormula" elements_batch_size = 5 images_scale = 1.66 # = 120 dpi, aligned with training data resolution expansion_factor = 0.03 diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 302d18cb..6e71246b 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -56,7 +56,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): Processes a batch of elements and adds classification annotations. """ - _model_repo_folder = "DocumentFigureClassifier" + _model_repo_folder = "ds4sd--DocumentFigureClassifier" images_scale = 2 def __init__( diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index c88f91cb..b3cbd954 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -22,7 +22,7 @@ _log = logging.getLogger(__name__) class LayoutModel(BasePageModel): - _model_repo_folder = "docling-models" + _model_repo_folder = "ds4sd--docling-models" _model_path = "model_artifacts/layout" TEXT_ELEM_LABELS = [ diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index b5ab5a2a..64979157 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -23,7 +23,7 @@ from docling.utils.profiling import TimeRecorder class TableStructureModel(BasePageModel): - _model_repo_folder = "docling-models" + _model_repo_folder = "ds4sd--docling-models" _model_path = "model_artifacts/tableformer" def __init__( From fba3cf9be75e239896c353a1c0f3bdd0fa4a92fa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 7 Feb 2025 13:36:54 +0000 Subject: [PATCH 3/6] chore: bump version to 2.19.0 [skip ci] --- CHANGELOG.md | 17 +++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ad3b47d..8dc85cf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,20 @@ +## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07 + +### Feature + +* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1)) + +### Fix + +* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) ([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3)) +* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79)) +* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812)) +* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e)) + +### Documentation + +* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d)) + ## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03 ### Feature diff --git a/pyproject.toml b/pyproject.toml index e1c30a3c..3bc88b05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.18.0" # DO NOT EDIT, updated automatically +version = "2.19.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT" From 4cc6e3ea5e858b367136acc729b723ea0552d22a Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Fri, 7 Feb 2025 16:30:42 +0100 Subject: [PATCH 4/6] feat: Describe pictures using vision models (#259) * draft for picture description models Signed-off-by: Michele Dolfi * vlm description using AutoModelForVision2Seq Signed-off-by: Michele Dolfi * add generation options Signed-off-by: Michele Dolfi * update vlm API Signed-off-by: Michele Dolfi * allow only localhost traffic Signed-off-by: Michele Dolfi * rename model Signed-off-by: Michele Dolfi * do not run with vlm api Signed-off-by: Michele Dolfi * more renaming Signed-off-by: Michele Dolfi * fix examples path Signed-off-by: Michele Dolfi * apply CLI download login Signed-off-by: Michele Dolfi * fix name of cli argument Signed-off-by: Michele Dolfi * use with_smolvlm in models download Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- .github/workflows/checks.yml | 2 +- docling/cli/main.py | 5 + docling/cli/models.py | 2 + docling/datamodel/pipeline_options.py | 54 ++++++++- docling/models/base_model.py | 4 +- .../models/picture_description_api_model.py | 105 +++++++++++++++++ .../models/picture_description_base_model.py | 64 ++++++++++ .../models/picture_description_vlm_model.py | 109 ++++++++++++++++++ docling/pipeline/standard_pdf_pipeline.py | 42 ++++++- docling/utils/model_downloader.py | 12 ++ docs/examples/pictures_description.py | 48 ++++++++ docs/examples/pictures_description_api.py | 55 +++++++++ poetry.lock | 9 +- pyproject.toml | 8 +- 14 files changed, 508 insertions(+), 11 deletions(-) create mode 100644 docling/models/picture_description_api_model.py create mode 100644 docling/models/picture_description_base_model.py create mode 100644 docling/models/picture_description_vlm_model.py create mode 100644 docs/examples/pictures_description.py create mode 100644 docs/examples/pictures_description_api.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 75ea5970..89bcfd79 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -28,7 +28,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then echo "Skipping $file" continue fi diff --git a/docling/cli/main.py b/docling/cli/main.py index 19f77e4e..e2bc0dd6 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -226,6 +226,10 @@ def convert( help="Enable the picture classification enrichment model in the pipeline.", ), ] = False, + enrich_picture_description: Annotated[ + bool, + typer.Option(..., help="Enable the picture description model in the pipeline."), + ] = False, artifacts_path: Annotated[ Optional[Path], typer.Option(..., help="If provided, the location of the model artifacts."), @@ -382,6 +386,7 @@ def convert( do_table_structure=True, do_code_enrichment=enrich_code, do_formula_enrichment=enrich_formula, + do_picture_description=enrich_picture_description, do_picture_classification=enrich_picture_classes, document_timeout=document_timeout, ) diff --git a/docling/cli/models.py b/docling/cli/models.py index aea498c5..3b62ad6b 100644 --- a/docling/cli/models.py +++ b/docling/cli/models.py @@ -31,6 +31,7 @@ class _AvailableModels(str, Enum): TABLEFORMER = "tableformer" CODE_FORMULA = "code_formula" PICTURE_CLASSIFIER = "picture_classifier" + SMOLVLM = "smolvlm" EASYOCR = "easyocr" @@ -81,6 +82,7 @@ def download( with_tableformer=_AvailableModels.TABLEFORMER in to_download, with_code_formula=_AvailableModels.CODE_FORMULA in to_download, with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download, + with_smolvlm=_AvailableModels.SMOLVLM in to_download, with_easyocr=_AvailableModels.EASYOCR in to_download, ) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 14ca75bf..3b6401b6 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -2,9 +2,9 @@ import logging import os from enum import Enum from pathlib import Path -from typing import Any, List, Literal, Optional, Union +from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict _log = logging.getLogger(__name__) @@ -184,6 +184,51 @@ class OcrMacOptions(OcrOptions): ) +class PictureDescriptionBaseOptions(BaseModel): + kind: str + batch_size: int = 8 + scale: float = 2 + + bitmap_area_threshold: float = ( + 0.2 # percentage of the area for a bitmap to processed with the models + ) + + +class PictureDescriptionApiOptions(PictureDescriptionBaseOptions): + kind: Literal["api"] = "api" + + url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions") + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + timeout: float = 20 + + prompt: str = "Describe this image in a few sentences." + provenance: str = "" + + +class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): + kind: Literal["vlm"] = "vlm" + + repo_id: str + prompt: str = "Describe this image in a few sentences." + # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig + generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False) + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +smolvlm_picture_description = PictureDescriptionVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" +) +# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") +granite_picture_description = PictureDescriptionVlmOptions( + repo_id="ibm-granite/granite-vision-3.1-2b-preview", + prompt="What is shown in this image?", +) + + # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -223,6 +268,7 @@ class PdfPipelineOptions(PipelineOptions): do_code_enrichment: bool = False # True: perform code OCR do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code do_picture_classification: bool = False # True: classify pictures in documents + do_picture_description: bool = False # True: run describe pictures in documents table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ @@ -232,6 +278,10 @@ class PdfPipelineOptions(PipelineOptions): OcrMacOptions, RapidOcrOptions, ] = Field(EasyOcrOptions(), discriminator="kind") + picture_description_options: Annotated[ + Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions], + Field(discriminator="kind"), + ] = smolvlm_picture_description images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/base_model.py b/docling/models/base_model.py index a2bc776e..9cdc0ecb 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import Any, Generic, Iterable, Optional -from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem +from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem from typing_extensions import TypeVar from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page @@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel( if not self.is_processable(doc=conv_res.document, element=element): return None - assert isinstance(element, TextItem) + assert isinstance(element, DocItem) element_prov = element.prov[0] bbox = element_prov.bbox diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py new file mode 100644 index 00000000..6c7e02fc --- /dev/null +++ b/docling/models/picture_description_api_model.py @@ -0,0 +1,105 @@ +import base64 +import io +import logging +from typing import Iterable, List, Optional + +import httpx +from docling_core.types.doc import PictureItem +from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc + PictureDescriptionData, +) +from PIL import Image +from pydantic import BaseModel, ConfigDict + +from docling.datamodel.pipeline_options import PictureDescriptionApiOptions +from docling.models.picture_description_base_model import PictureDescriptionBaseModel + +_log = logging.getLogger(__name__) + + +class ChatMessage(BaseModel): + role: str + content: str + + +class ResponseChoice(BaseModel): + index: int + message: ChatMessage + finish_reason: str + + +class ResponseUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class ApiResponse(BaseModel): + model_config = ConfigDict( + protected_namespaces=(), + ) + + id: str + model: Optional[str] = None # returned by openai + choices: List[ResponseChoice] + created: int + usage: ResponseUsage + + +class PictureDescriptionApiModel(PictureDescriptionBaseModel): + # elements_batch_size = 4 + + def __init__(self, enabled: bool, options: PictureDescriptionApiOptions): + super().__init__(enabled=enabled, options=options) + self.options: PictureDescriptionApiOptions + + if self.enabled: + if options.url.host != "localhost": + raise NotImplementedError( + "The options try to connect to remote APIs which are not yet allowed." + ) + + def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: + # Note: technically we could make a batch request here, + # but not all APIs will allow for it. For example, vllm won't allow more than 1. + for image in images: + img_io = io.BytesIO() + image.save(img_io, "PNG") + image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8") + + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": self.options.prompt, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, + }, + ], + } + ] + + payload = { + "messages": messages, + **self.options.params, + } + + r = httpx.post( + str(self.options.url), + headers=self.options.headers, + json=payload, + timeout=self.options.timeout, + ) + if not r.is_success: + _log.error(f"Error calling the API. Reponse was {r.text}") + r.raise_for_status() + + api_resp = ApiResponse.model_validate_json(r.text) + generated_text = api_resp.choices[0].message.content.strip() + yield generated_text diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py new file mode 100644 index 00000000..b653e0e3 --- /dev/null +++ b/docling/models/picture_description_base_model.py @@ -0,0 +1,64 @@ +import logging +from pathlib import Path +from typing import Any, Iterable, List, Optional, Union + +from docling_core.types.doc import ( + DoclingDocument, + NodeItem, + PictureClassificationClass, + PictureItem, +) +from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc + PictureDescriptionData, +) +from PIL import Image + +from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions +from docling.models.base_model import ( + BaseItemAndImageEnrichmentModel, + ItemAndImageEnrichmentElement, +) + + +class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel): + images_scale: float = 2.0 + + def __init__( + self, + enabled: bool, + options: PictureDescriptionBaseOptions, + ): + self.enabled = enabled + self.options = options + self.provenance = "not-implemented" + + def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: + return self.enabled and isinstance(element, PictureItem) + + def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: + raise NotImplementedError + + def __call__( + self, + doc: DoclingDocument, + element_batch: Iterable[ItemAndImageEnrichmentElement], + ) -> Iterable[NodeItem]: + if not self.enabled: + for element in element_batch: + yield element.item + return + + images: List[Image.Image] = [] + elements: List[PictureItem] = [] + for el in element_batch: + assert isinstance(el.item, PictureItem) + elements.append(el.item) + images.append(el.image) + + outputs = self._annotate_images(images) + + for item, output in zip(elements, outputs): + item.annotations.append( + PictureDescriptionData(text=output, provenance=self.provenance) + ) + yield item diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py new file mode 100644 index 00000000..9fa4826d --- /dev/null +++ b/docling/models/picture_description_vlm_model.py @@ -0,0 +1,109 @@ +from pathlib import Path +from typing import Iterable, Optional, Union + +from PIL import Image + +from docling.datamodel.pipeline_options import ( + AcceleratorOptions, + PictureDescriptionVlmOptions, +) +from docling.models.picture_description_base_model import PictureDescriptionBaseModel +from docling.utils.accelerator_utils import decide_device + + +class PictureDescriptionVlmModel(PictureDescriptionBaseModel): + + def __init__( + self, + enabled: bool, + artifacts_path: Optional[Union[Path, str]], + options: PictureDescriptionVlmOptions, + accelerator_options: AcceleratorOptions, + ): + super().__init__(enabled=enabled, options=options) + self.options: PictureDescriptionVlmOptions + + if self.enabled: + + if artifacts_path is None: + artifacts_path = self.download_models(repo_id=self.options.repo_id) + else: + artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder + + self.device = decide_device(accelerator_options.device) + + try: + import torch + from transformers import AutoModelForVision2Seq, AutoProcessor + except ImportError: + raise ImportError( + "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`." + ) + + # Initialize processor and model + self.processor = AutoProcessor.from_pretrained(self.options.repo_id) + self.model = AutoModelForVision2Seq.from_pretrained( + self.options.repo_id, + torch_dtype=torch.bfloat16, + _attn_implementation=( + "flash_attention_2" if self.device.startswith("cuda") else "eager" + ), + ).to(self.device) + + self.provenance = f"{self.options.repo_id}" + + @staticmethod + def download_models( + repo_id: str, + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + repo_id=repo_id, + force_download=force, + local_dir=local_dir, + ) + + return Path(download_path) + + def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]: + from transformers import GenerationConfig + + # Create input messages + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": self.options.prompt}, + ], + }, + ] + + # TODO: do batch generation + + for image in images: + # Prepare inputs + prompt = self.processor.apply_chat_template( + messages, add_generation_prompt=True + ) + inputs = self.processor(text=prompt, images=[image], return_tensors="pt") + inputs = inputs.to(self.device) + + # Generate outputs + generated_ids = self.model.generate( + **inputs, + generation_config=GenerationConfig(**self.options.generation_config), + ) + generated_texts = self.processor.batch_decode( + generated_ids[:, inputs["input_ids"].shape[1] :], + skip_special_tokens=True, + ) + + yield generated_texts[0].strip() diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 4e66415f..13e435f9 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -14,6 +14,8 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, + PictureDescriptionApiOptions, + PictureDescriptionVlmOptions, RapidOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, @@ -34,6 +36,9 @@ from docling.models.page_preprocessing_model import ( PagePreprocessingModel, PagePreprocessingOptions, ) +from docling.models.picture_description_api_model import PictureDescriptionApiModel +from docling.models.picture_description_base_model import PictureDescriptionBaseModel +from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel from docling.models.rapid_ocr_model import RapidOcrModel from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel @@ -95,8 +100,17 @@ class StandardPdfPipeline(PaginatedPipeline): PageAssembleModel(options=PageAssembleOptions()), ] + # Picture description model + if ( + picture_description_model := self.get_picture_description_model( + artifacts_path=artifacts_path + ) + ) is None: + raise RuntimeError( + f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}." + ) + self.enrichment_pipe = [ - # Other models working on `NodeItem` elements in the DoclingDocument # Code Formula Enrichment Model CodeFormulaModel( enabled=pipeline_options.do_code_enrichment @@ -115,11 +129,14 @@ class StandardPdfPipeline(PaginatedPipeline): options=DocumentPictureClassifierOptions(), accelerator_options=pipeline_options.accelerator_options, ), + # Document Picture description + picture_description_model, ] if ( self.pipeline_options.do_formula_enrichment or self.pipeline_options.do_code_enrichment + or self.pipeline_options.do_picture_description ): self.keep_backend = True @@ -175,6 +192,29 @@ class StandardPdfPipeline(PaginatedPipeline): ) return None + def get_picture_description_model( + self, artifacts_path: Optional[Path] = None + ) -> Optional[PictureDescriptionBaseModel]: + if isinstance( + self.pipeline_options.picture_description_options, + PictureDescriptionApiOptions, + ): + return PictureDescriptionApiModel( + enabled=self.pipeline_options.do_picture_description, + options=self.pipeline_options.picture_description_options, + ) + elif isinstance( + self.pipeline_options.picture_description_options, + PictureDescriptionVlmOptions, + ): + return PictureDescriptionVlmModel( + enabled=self.pipeline_options.do_picture_description, + artifacts_path=artifacts_path, + options=self.pipeline_options.picture_description_options, + accelerator_options=self.pipeline_options.accelerator_options, + ) + return None + def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: with TimeRecorder(conv_res, "page_init"): page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index 504618ec..7d22b77b 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -2,11 +2,13 @@ import logging from pathlib import Path from typing import Optional +from docling.datamodel.pipeline_options import smolvlm_picture_description from docling.datamodel.settings import settings from docling.models.code_formula_model import CodeFormulaModel from docling.models.document_picture_classifier import DocumentPictureClassifier from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel +from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel from docling.models.table_structure_model import TableStructureModel _log = logging.getLogger(__name__) @@ -21,6 +23,7 @@ def download_models( with_tableformer: bool = True, with_code_formula: bool = True, with_picture_classifier: bool = True, + with_smolvlm: bool = True, with_easyocr: bool = True, ): if output_dir is None: @@ -61,6 +64,15 @@ def download_models( progress=progress, ) + if with_smolvlm: + _log.info(f"Downloading SmolVlm model...") + PictureDescriptionVlmModel.download_models( + repo_id=smolvlm_picture_description.repo_id, + local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, + force=force, + progress=progress, + ) + if with_easyocr: _log.info(f"Downloading easyocr models...") EasyOcrModel.download_models( diff --git a/docs/examples/pictures_description.py b/docs/examples/pictures_description.py new file mode 100644 index 00000000..f60ac29d --- /dev/null +++ b/docs/examples/pictures_description.py @@ -0,0 +1,48 @@ +import logging +from pathlib import Path + +from docling_core.types.doc import PictureItem + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + granite_picture_description, + smolvlm_picture_description, +) +from docling.document_converter import DocumentConverter, PdfFormatOption + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_picture_description = True + pipeline_options.picture_description_options = smolvlm_picture_description + # pipeline_options.picture_description_options = granite_picture_description + + pipeline_options.picture_description_options.prompt = ( + "Describe the image in three sentences. Be consise and accurate." + ) + + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + result = doc_converter.convert(input_doc_path) + + for element, _level in result.document.iterate_items(): + if isinstance(element, PictureItem): + print( + f"Picture {element.self_ref}\n" + f"Caption: {element.caption_text(doc=result.document)}\n" + f"Annotations: {element.annotations}" + ) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py new file mode 100644 index 00000000..3da37edf --- /dev/null +++ b/docs/examples/pictures_description_api.py @@ -0,0 +1,55 @@ +import logging +from pathlib import Path + +from docling_core.types.doc import PictureItem + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + PictureDescriptionApiOptions, +) +from docling.document_converter import DocumentConverter, PdfFormatOption + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + + # This is using a local API server to do picture description. + # For example, you can launch it locally with: + # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct" + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_picture_description = True + pipeline_options.picture_description_options = PictureDescriptionApiOptions( + url="http://localhost:8000/v1/chat/completions", + params=dict( + model="HuggingFaceTB/SmolVLM-256M-Instruct", + seed=42, + max_completion_tokens=200, + ), + prompt="Describe the image in three sentences. Be consise and accurate.", + timeout=90, + ) + + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + result = doc_converter.convert(input_doc_path) + + for element, _level in result.document.iterate_items(): + if isinstance(element, PictureItem): + print( + f"Picture {element.self_ref}\n" + f"Caption: {element.caption_text(doc=result.document)}\n" + f"Annotations: {element.annotations}" + ) + + +if __name__ == "__main__": + main() diff --git a/poetry.lock b/poetry.lock index b261db4b..691dd844 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2727,13 +2727,13 @@ pygments = ">2.12.0" [[package]] name = "mkdocs-material" -version = "9.6.2" +version = "9.6.3" description = "Documentation that simply works" optional = false python-versions = ">=3.8" files = [ - {file = "mkdocs_material-9.6.2-py3-none-any.whl", hash = "sha256:71d90dbd63b393ad11a4d90151dfe3dcbfcd802c0f29ce80bebd9bbac6abc753"}, - {file = "mkdocs_material-9.6.2.tar.gz", hash = "sha256:a3de1c5d4c745f10afa78b1a02f917b9dce0808fb206adc0f5bb48b58c1ca21f"}, + {file = "mkdocs_material-9.6.3-py3-none-any.whl", hash = "sha256:1125622067e26940806701219303b27c0933e04533560725d97ec26fd16a39cf"}, + {file = "mkdocs_material-9.6.3.tar.gz", hash = "sha256:c87f7d1c39ce6326da5e10e232aed51bae46252e646755900f4b0fc9192fa832"}, ] [package.dependencies] @@ -7846,8 +7846,9 @@ type = ["pytest-mypy"] ocrmac = ["ocrmac"] rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"] tesserocr = ["tesserocr"] +vlm = ["transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ca0464df452664834ae9bccc59f89240e2f5e8f3b179761de615548c799680e7" +content-hash = "86d266adc6272f3db65ab07f5cce35cbe9626368dc0e09ab374c861f0809f693" diff --git a/pyproject.toml b/pyproject.toml index 3bc88b05..9b1b5e9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,10 @@ onnxruntime = [ { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } ] +transformers = [ + {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, + {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true } +] pillow = "^10.0.0" tqdm = "^4.65.0" @@ -121,6 +125,7 @@ torchvision = [ [tool.poetry.extras] tesserocr = ["tesserocr"] ocrmac = ["ocrmac"] +vlm = ["transformers"] rapidocr = ["rapidocr-onnxruntime", "onnxruntime"] [tool.poetry.scripts] @@ -162,7 +167,8 @@ module = [ "deepsearch_glm.*", "lxml.*", "bs4.*", - "huggingface_hub.*" + "huggingface_hub.*", + "transformers.*", ] ignore_missing_imports = true From c18f47c5c032c49bf3175aecd2236df37c0e9ae1 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:51:31 +0100 Subject: [PATCH 5/6] fix: remove unused httpx (#919) * remove unused httpx Signed-off-by: Michele Dolfi * use requests instead of httpx Signed-off-by: Michele Dolfi * remove more usage of httpx Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docling/models/easyocr_model.py | 2 -- docling/models/picture_description_api_model.py | 10 +++------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 9b1b2a02..0eccb988 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -4,9 +4,7 @@ import zipfile from pathlib import Path from typing import Iterable, List, Optional -import httpx import numpy -import torch from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import Cell, OcrCell, Page diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py index 6c7e02fc..86b76944 100644 --- a/docling/models/picture_description_api_model.py +++ b/docling/models/picture_description_api_model.py @@ -3,11 +3,7 @@ import io import logging from typing import Iterable, List, Optional -import httpx -from docling_core.types.doc import PictureItem -from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc - PictureDescriptionData, -) +import requests from PIL import Image from pydantic import BaseModel, ConfigDict @@ -90,13 +86,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel): **self.options.params, } - r = httpx.post( + r = requests.post( str(self.options.url), headers=self.options.headers, json=payload, timeout=self.options.timeout, ) - if not r.is_success: + if not r.ok: _log.error(f"Error calling the API. Reponse was {r.text}") r.raise_for_status() From 3e26597995f236fe81ccd7f1a247d05b8a8420cb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 7 Feb 2025 17:46:36 +0000 Subject: [PATCH 6/6] chore: bump version to 2.20.0 [skip ci] --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8dc85cf5..030c6954 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07 + +### Feature + +* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a)) + +### Fix + +* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) ([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1)) + ## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07 ### Feature diff --git a/pyproject.toml b/pyproject.toml index 9b1b5e9b..e4425ffe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.19.0" # DO NOT EDIT, updated automatically +version = "2.20.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT"