From 2cece27208c4bce715d20000b845794dfb97843d Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 28 Oct 2024 14:28:26 +0100 Subject: [PATCH 1/4] docs: update LlamaIndex docs for Docling v2 (#182) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/examples/rag_llamaindex.ipynb | 137 +++++++++++++++++------------ docs/integrations/llamaindex.md | 11 +-- 2 files changed, 86 insertions(+), 62 deletions(-) diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb index e5b8d68d..0252bc4f 100644 --- a/docs/examples/rag_llamaindex.ipynb +++ b/docs/examples/rag_llamaindex.ipynb @@ -14,13 +14,6 @@ "# RAG with LlamaIndex đŸĻ™" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> â„šī¸ 👉 **The LlamaIndex Docling extension update to Docling v2 is ongoing; in the meanwhile, this notebook is showing current extension output, based on Docling v1.**" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -35,8 +28,8 @@ "This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n", "\n", "Presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n", - "- use PDF documents in your LLM applications with ease and speed, and\n", - "- harness Docling's rich format for advanced, document-native grounding." + "- use various document types in your LLM applications with ease and speed, and\n", + "- leverage Docling's rich format for advanced, document-native grounding." ] }, { @@ -69,7 +62,7 @@ } ], "source": [ - "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv" + "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv" ] }, { @@ -161,7 +154,7 @@ "output_type": "stream", "text": [ "Q: Which are the main AI models in Docling?\n", - "A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n", + "A: The main AI models in Docling are a layout analysis model, which is an accurate object-detector for page elements, and TableFormer, a state-of-the-art table structure recognition model.\n", "\n", "Sources:\n" ] @@ -170,11 +163,9 @@ "data": { "text/plain": [ "[('3.2 AI models\\n\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'Header_2': '3.2 AI models'}),\n", + " {'Header_2': '3.2 AI models'}),\n", " (\"5 Applications\\n\\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.\",\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'Header_2': '5 Applications'})]" + " {'Header_2': '5 Applications'})]" ] }, "metadata": {}, @@ -243,23 +234,41 @@ "data": { "text/plain": [ "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/37',\n", - " 'heading': '3.2 AI models',\n", - " 'page': 3,\n", - " 'bbox': [107.36903381347656,\n", - " 330.07513427734375,\n", - " 506.29705810546875,\n", - " 407.3725280761719]}),\n", + " {'schema_name': 'docling_core.transforms.chunker.DocMeta',\n", + " 'version': '1.0.0',\n", + " 'doc_items': [{'self_ref': '#/texts/34',\n", + " 'parent': {'$ref': '#/body'},\n", + " 'children': [],\n", + " 'label': 'text',\n", + " 'prov': [{'page_no': 3,\n", + " 'bbox': {'l': 107.07593536376953,\n", + " 't': 406.1695251464844,\n", + " 'r': 504.1148681640625,\n", + " 'b': 330.2677307128906,\n", + " 'coord_origin': 'BOTTOMLEFT'},\n", + " 'charspan': [0, 608]}]}],\n", + " 'headings': ['3.2 AI models'],\n", + " 'origin': {'mimetype': 'application/pdf',\n", + " 'binary_hash': 14981478401387673002,\n", + " 'filename': '2408.09869v3.pdf'}}),\n", " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/10',\n", - " 'heading': '1 Introduction',\n", - " 'page': 1,\n", - " 'bbox': [107.33261108398438,\n", - " 83.3067626953125,\n", - " 504.0033874511719,\n", - " 136.45367431640625]})]" + " {'schema_name': 'docling_core.transforms.chunker.DocMeta',\n", + " 'version': '1.0.0',\n", + " 'doc_items': [{'self_ref': '#/texts/9',\n", + " 'parent': {'$ref': '#/body'},\n", + " 'children': [],\n", + " 'label': 'text',\n", + " 'prov': [{'page_no': 1,\n", + " 'bbox': {'l': 107.0031967163086,\n", + " 't': 136.7283935546875,\n", + " 'r': 504.04998779296875,\n", + " 'b': 83.30133056640625,\n", + " 'coord_origin': 'BOTTOMLEFT'},\n", + " 'charspan': [0, 488]}]}],\n", + " 'headings': ['1 Introduction'],\n", + " 'origin': {'mimetype': 'application/pdf',\n", + " 'binary_hash': 14981478401387673002,\n", + " 'filename': '2408.09869v3.pdf'}})]" ] }, "metadata": {}, @@ -335,7 +344,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.15s/file]\n" + "Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.27s/file]\n" ] }, { @@ -343,7 +352,7 @@ "output_type": "stream", "text": [ "Q: Which are the main AI models in Docling?\n", - "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n", + "A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n", "\n", "Sources:\n" ] @@ -352,35 +361,53 @@ "data": { "text/plain": [ "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. 
The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", + " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp2ooyusg5/2408.09869.pdf',\n", " 'file_name': '2408.09869.pdf',\n", " 'file_type': 'application/pdf',\n", " 'file_size': 5566574,\n", - " 'creation_date': '2024-10-09',\n", - " 'last_modified_date': '2024-10-09',\n", - " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/37',\n", - " 'heading': '3.2 AI models',\n", - " 'page': 3,\n", - " 'bbox': [107.36903381347656,\n", - " 330.07513427734375,\n", - " 506.29705810546875,\n", - " 407.3725280761719]}),\n", + " 'creation_date': '2024-10-28',\n", + " 'last_modified_date': '2024-10-28',\n", + " 'schema_name': 'docling_core.transforms.chunker.DocMeta',\n", + " 'version': '1.0.0',\n", + " 'doc_items': [{'self_ref': '#/texts/34',\n", + " 'parent': {'$ref': '#/body'},\n", + " 'children': [],\n", + " 'label': 'text',\n", + " 'prov': [{'page_no': 3,\n", + " 'bbox': {'l': 107.07593536376953,\n", + " 't': 406.1695251464844,\n", + " 'r': 504.1148681640625,\n", + " 'b': 330.2677307128906,\n", + " 'coord_origin': 'BOTTOMLEFT'},\n", + " 'charspan': [0, 608]}]}],\n", + " 'headings': ['3.2 AI models'],\n", + " 'origin': {'mimetype': 'application/pdf',\n", + " 'binary_hash': 14981478401387673002,\n", + " 'filename': '2408.09869.pdf'}}),\n", " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.',\n", - " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", + " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp2ooyusg5/2408.09869.pdf',\n", " 'file_name': '2408.09869.pdf',\n", " 'file_type': 'application/pdf',\n", " 'file_size': 5566574,\n", - " 'creation_date': '2024-10-09',\n", - " 'last_modified_date': '2024-10-09',\n", - " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/10',\n", - " 'heading': '1 Introduction',\n", - " 'page': 1,\n", - " 'bbox': [107.33261108398438,\n", - " 83.3067626953125,\n", - " 504.0033874511719,\n", - " 136.45367431640625]})]" + " 'creation_date': '2024-10-28',\n", + " 'last_modified_date': '2024-10-28',\n", + " 'schema_name': 'docling_core.transforms.chunker.DocMeta',\n", + " 'version': '1.0.0',\n", + " 'doc_items': [{'self_ref': '#/texts/9',\n", + " 'parent': {'$ref': '#/body'},\n", + " 'children': [],\n", + " 'label': 'text',\n", + " 'prov': [{'page_no': 1,\n", + " 'bbox': {'l': 107.0031967163086,\n", + " 't': 136.7283935546875,\n", + " 'r': 504.04998779296875,\n", + " 'b': 83.30133056640625,\n", + " 'coord_origin': 'BOTTOMLEFT'},\n", + " 'charspan': [0, 488]}]}],\n", + " 'headings': ['1 Introduction'],\n", + " 'origin': {'mimetype': 'application/pdf',\n", + " 'binary_hash': 14981478401387673002,\n", + " 'filename': '2408.09869.pdf'}})]" ] }, "metadata": {}, diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md index af82da31..424532ab 100644 --- a/docs/integrations/llamaindex.md +++ b/docs/integrations/llamaindex.md @@ -2,11 +2,8 @@ Docling is available as an official LlamaIndex extension! -To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). - -!!! info "Docling v2" - - The LlamaIndex Docling extension update to Docling v2 is ongoing. + +To get started, check out the [step-by-step guide \[↗\]](https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/data_connectors/DoclingReaderDemo.ipynb) ## Components @@ -15,15 +12,15 @@ To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https:/ Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown). - đŸ’ģ [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling) -- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/) - đŸ“Ļ [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/) - đŸĻ™ [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling) + ### Docling Node Parser Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding. 
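For orientation, a minimal end-to-end sketch of how the two components plug together, condensed from the updated notebook above; the source URL, query, and embedding model are the notebook's own examples, and the snippet assumes the packages installed there (`llama-index-readers-docling`, `llama-index-node-parser-docling`, `llama-index-embeddings-huggingface`):

```python
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# Serialize the DoclingDocument as JSON so the node parser can exploit
# Docling's document structure (headings, provenance) downstream.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

index = VectorStoreIndex.from_documents(
    documents=reader.load_data("https://arxiv.org/pdf/2408.09869"),  # Docling technical report
    transformations=[node_parser],
    embed_model=HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
)

# Each retrieved node carries Docling metadata such as its headings.
for hit in index.as_retriever(similarity_top_k=2).retrieve(
    "Which are the main AI models in Docling?"
):
    print(hit.metadata.get("headings"), "->", hit.text[:60])
```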
- đŸ’ģ [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling) -- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/) - đŸ“Ļ [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/) - đŸĻ™ [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling) + From 94d0729c500b0be8ac4a1cd3025b42048f6e8d5a Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:34:48 +0100 Subject: [PATCH 2/4] fix: handling of long sequence of unescaped underscore chars in markdown (#173) * Fix for md hanging when encountering long sequence of unescaped underscore chars Signed-off-by: Maksym Lysak * Added comment explaining reason for fix Signed-off-by: Maksym Lysak * Fixed trailing inline text handling (at the end of a file), and corrected underscore sequence shortening Signed-off-by: Maksym Lysak * making fix more rare Signed-off-by: Maksym Lysak --------- Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/md_backend.py | 38 +++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 900319c0..2bcc6d7d 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -1,4 +1,6 @@ import logging +import re +import warnings from io import BytesIO from pathlib import Path from typing import Set, Union @@ -25,6 +27,30 @@ _log = logging.getLogger(__name__) class MarkdownDocumentBackend(DeclarativeDocumentBackend): + + def shorten_underscore_sequences(self, markdown_text, max_length=10): + # This regex will match any sequence of underscores + pattern = r"_+" + + def replace_match(match): + underscore_sequence = match.group( + 0 + ) # Get the full match (sequence of underscores) + + # Shorten the sequence if it exceeds max_length + if len(underscore_sequence) > max_length: + return "_" * max_length + else: + return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length + + # Use re.sub to replace long underscore sequences + shortened_text = re.sub(pattern, replace_match, markdown_text) + + if len(shortened_text) != len(markdown_text): + warnings.warn("Detected potentially incorrect Markdown, correcting...") + + return shortened_text + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) @@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): try: if isinstance(self.path_or_stream, BytesIO): text_stream = self.path_or_stream.getvalue().decode("utf-8") - self.markdown = text_stream + # remove invalid sequences + # very long sequences of underscores will lead to unnecessary long processing times. + # In any proper Markdown files, underscores have to be escaped, + # otherwise they represent emphasis (bold or italic) + self.markdown = self.shorten_underscore_sequences(text_stream) if isinstance(self.path_or_stream, Path): with open(self.path_or_stream, "r", encoding="utf-8") as f: md_content = f.read() - self.markdown = md_content + # remove invalid sequences + # very long sequences of underscores will lead to unnecessary long processing times. 
+ # In any proper Markdown files, underscores have to be escaped, + # otherwise they represent emphasis (bold or italic) + self.markdown = self.shorten_underscore_sequences(md_content) self.valid = True _log.debug(self.markdown) From b9f5c74a7d13827c2b7887ddbf0b4eb43edd0846 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:02:52 +0100 Subject: [PATCH 3/4] fix: fix header levels for DOCX & HTML (#184) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/html_backend.py | 19 ++++++------------- docling/backend/msword_backend.py | 24 +++++++++--------------- tests/test_backend_html.py | 30 ++++++++++++++++++++++++++++++ tests/test_backend_msword.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 28 deletions(-) create mode 100644 tests/test_backend_html.py create mode 100644 tests/test_backend_msword.py diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b8026057..7fd69cff 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -180,11 +180,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[0], label=DocItemLabel.TITLE, text=text ) - elif hlevel == self.level: - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) - elif hlevel > self.level: # add invisible group @@ -194,10 +189,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label=GroupLabel.SECTION, parent=self.parents[i - 1], ) - - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) self.level = hlevel elif hlevel < self.level: @@ -206,12 +197,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): for key, val in self.parents.items(): if key > hlevel: self.parents[key] = None - - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) self.level = hlevel + self.parents[hlevel] = doc.add_heading( + parent=self.parents[hlevel - 1], + text=text, + level=hlevel, + ) + def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" if element.text is None: diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5b166d5b..08529ea0 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self.get_level() if isinstance(curr_level, int): - if curr_level == level: - - self.parents[level] = doc.add_heading( - parent=self.parents[level - 1], text=text - ) - - elif curr_level > level: + if curr_level > level: # add invisible group for i in range(level, curr_level): @@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", ) - self.parents[curr_level] = doc.add_heading( - parent=self.parents[curr_level - 1], text=text - ) - elif curr_level < level: # remove the tail @@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if key >= curr_level: self.parents[key] = None - self.parents[curr_level] = doc.add_heading( - parent=self.parents[curr_level - 1], text=text - ) + self.parents[curr_level] = doc.add_heading( + parent=self.parents[curr_level - 1], + text=text, + level=curr_level, + ) else: self.parents[self.level] = doc.add_heading( - parent=self.parents[self.level - 1], text=text + parent=self.parents[self.level - 1], + text=text, + level=1, ) return diff --git 
a/tests/test_backend_html.py b/tests/test_backend_html.py new file mode 100644 index 00000000..f5ec0351 --- /dev/null +++ b/tests/test_backend_html.py @@ -0,0 +1,30 @@ +from pathlib import Path + +from docling.backend.html_backend import HTMLDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument, SectionHeaderItem + + +def test_heading_levels(): + in_path = Path("tests/data/wiki_duck.html") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.HTML, + backend=HTMLDocumentBackend, + ) + backend = HTMLDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_lvl_2 = found_lvl_3 = False + for item, _ in doc.iterate_items(): + if isinstance(item, SectionHeaderItem): + if item.text == "Etymology": + found_lvl_2 = True + assert item.level == 2 + elif item.text == "Feeding": + found_lvl_3 = True + assert item.level == 3 + assert found_lvl_2 and found_lvl_3 diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py new file mode 100644 index 00000000..4544e717 --- /dev/null +++ b/tests/test_backend_msword.py @@ -0,0 +1,30 @@ +from pathlib import Path + +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument, SectionHeaderItem + + +def test_heading_levels(): + in_path = Path("tests/data/word_sample.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_lvl_1 = found_lvl_2 = False + for item, _ in doc.iterate_items(): + if isinstance(item, SectionHeaderItem): + if item.text == "Let\u2019s swim!": + found_lvl_1 = True + assert item.level == 1 + elif item.text == "Let\u2019s eat": + found_lvl_2 = True + assert item.level == 2 + assert found_lvl_1 and found_lvl_2 From dda2645d4c729c60b1b189bf01cdc802e2c33ee5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 28 Oct 2024 17:18:41 +0000 Subject: [PATCH 4/4] chore: bump version to 2.2.1 [skip ci] --- CHANGELOG.md | 15 +++++++++++++++ pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1108f627..34247515 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,18 @@ +## [v2.2.1](https://github.com/DS4SD/docling/releases/tag/v2.2.1) - 2024-10-28 + +### Fix + +* Fix header levels for DOCX & HTML ([#184](https://github.com/DS4SD/docling/issues/184)) ([`b9f5c74`](https://github.com/DS4SD/docling/commit/b9f5c74a7d13827c2b7887ddbf0b4eb43edd0846)) +* Handling of long sequence of unescaped underscore chars in markdown ([#173](https://github.com/DS4SD/docling/issues/173)) ([`94d0729`](https://github.com/DS4SD/docling/commit/94d0729c500b0be8ac4a1cd3025b42048f6e8d5a)) +* HTML backend, fixes for Lists and nested texts ([#180](https://github.com/DS4SD/docling/issues/180)) ([`7d19418`](https://github.com/DS4SD/docling/commit/7d19418b779408c345473af684de6b7f60872b6e)) +* MD Backend, fixes to properly handle trailing inline text and emphasis in headers ([#178](https://github.com/DS4SD/docling/issues/178)) ([`88c1673`](https://github.com/DS4SD/docling/commit/88c16730571afdd3bfb8894f64d41b5e99bc5a5b)) + +### Documentation + +* Update LlamaIndex docs for Docling v2 ([#182](https://github.com/DS4SD/docling/issues/182)) 
([`2cece27`](https://github.com/DS4SD/docling/commit/2cece27208c4bce715d20000b845794dfb97843d)) +* Fix batch convert ([#177](https://github.com/DS4SD/docling/issues/177)) ([`189d3c2`](https://github.com/DS4SD/docling/commit/189d3c2d44ec389856f48696eaa78ac9f02f8cde)) +* Add export with embedded images ([#175](https://github.com/DS4SD/docling/issues/175)) ([`8d356aa`](https://github.com/DS4SD/docling/commit/8d356aa24715433d458eff4f5f0937ff5cb9cc69)) + ## [v2.2.0](https://github.com/DS4SD/docling/releases/tag/v2.2.0) - 2024-10-23 ### Feature diff --git a/pyproject.toml b/pyproject.toml index b4ad3372..26c34f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.2.0" # DO NOT EDIT, updated automatically +version = "2.2.1" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT"
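As a closing illustration of the user-facing effect of the v2.2.1 header-level fix, a minimal sketch: the sample file and the `SectionHeaderItem` import path are taken from the new backend test added in this patch series, while `DocumentConverter` is the standard Docling v2 entry point.

```python
from docling.datamodel.document import SectionHeaderItem
from docling.document_converter import DocumentConverter

# Convert an HTML page and inspect the heading hierarchy; with the
# v2.2.1 fix, nested headings surface as SectionHeaderItem with a level.
result = DocumentConverter().convert("tests/data/wiki_duck.html")

for item, _ in result.document.iterate_items():
    if isinstance(item, SectionHeaderItem):
        print(f"h{item.level}: {item.text}")  # e.g. "h2: Etymology", "h3: Feeding"
```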