diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index cae81085..216d156d 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,5 @@
import logging
-from io import BytesIO, TextIOWrapper
+from io import BytesIO
from pathlib import Path
from typing import Set, Union
@@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
self.analyse_element(element, idx, doc)
except Exception as exc_child:
+
_log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n")
- pass
+ raise exc_child
except Exception as exc:
pass
@@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element, idx, doc):
"""Handles list tags (ul, ol) and their list items."""
- # create a list group
- self.parents[self.level + 1] = doc.add_group(
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
- )
+ if element.name == "ul":
+ # create a list group
+ self.parents[self.level + 1] = doc.add_group(
+ parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+ )
+ elif element.name == "ol":
+ # create an ordered list group
+ self.parents[self.level + 1] = doc.add_group(
+ parent=self.parents[self.level],
+ name="ordered list",
+ label=GroupLabel.ORDERED_LIST,
+ )
self.level += 1
self.walk(element, doc)
@@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"])
+
+ parent_list_label = self.parents[self.level].label
+ index_in_list = len(self.parents[self.level].children) + 1
+
if nested_lists:
name = element.name
text = self.get_direct_text(element)
+ marker = ""
+ enumerated = False
+ if parent_list_label == GroupLabel.ORDERED_LIST:
+ marker = str(index_in_list)
+ enumerated = True
+
# create a list-item
- self.parents[self.level + 1] = doc.add_text(
- label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
+ self.parents[self.level + 1] = doc.add_list_item(
+ text=text,
+ enumerated=enumerated,
+ marker=marker,
+ parent=self.parents[self.level],
)
self.level += 1
@@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str):
text = element.text.strip()
- doc.add_text(
- label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
+ marker = ""
+ enumerated = False
+ if parent_list_label == GroupLabel.ORDERED_LIST:
+ marker = f"{str(index_in_list)}."
+ enumerated = True
+ doc.add_list_item(
+ text=text,
+ enumerated=enumerated,
+ marker=marker,
+ parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 7703e3b1..b67c3ca3 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
try:
self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
- except Exception:
+ except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
@@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list:
+ # TODO: determine if this is an unordered list or an ordered list.
+ # Set GroupLabel.ORDERED_LIST when it fits.
new_list = doc.add_group(
label=GroupLabel.LIST, name=f"list", parent=parent_slide
)
@@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = True
else:
e_is_a_list_item = False
+
if e_is_a_list_item:
- doc.add_text(
- label=DocItemLabel.LIST_ITEM,
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
+ doc.add_list_item(
parent=new_list,
text=e.text.strip(),
prov=prov,
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 49911009..c3504b33 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)
- doc.add_text(
- label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
- )
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
+ doc.add_list_item(parent=self.parents[level], text=text)
elif (
self.prev_numid() == numid and self.prev_indent() < ilevel
@@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
+ # TODO: determine if this is an unordered list or an ordered list.
+ # Set GroupLabel.ORDERED_LIST when it fits.
self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
)
- doc.add_text(
- label=DocItemLabel.LIST_ITEM,
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
+ doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
@@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
- doc.add_text(
- label=DocItemLabel.LIST_ITEM,
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
+ doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
- doc.add_text(
- label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
- )
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
+ doc.add_list_item(parent=self.parents[level - 1], text=text)
return
def handle_tables(self, element, docx_obj, doc):
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index ede4e328..5ec8b6d1 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -3,7 +3,7 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
-from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Union
import filetype
from docling_core.types import BaseText
@@ -23,10 +23,6 @@ from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.html_backend import HTMLDocumentBackend
-from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
-from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@@ -372,14 +368,20 @@ class ConvertedDocument(BaseModel):
strict_text: bool = False,
image_placeholder: str = "",
) -> str:
- return self.legacy_output.export_to_markdown(
- delim=delim,
- main_text_start=main_text_start,
- main_text_stop=main_text_stop,
- main_text_labels=main_text_labels,
- strict_text=strict_text,
- image_placeholder=image_placeholder,
- )
+ if self.legacy_output is None:
+ raise RuntimeError(
+ "No legacy output was produced, can not export as markdown. "
+ "Please use output.export_to_markdown() instead."
+ )
+ else:
+ return self.legacy_output.export_to_markdown(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ strict_text=strict_text,
+ image_placeholder=image_placeholder,
+ )
@deprecated("Use output.export_to_text() instead.")
def render_as_text(
@@ -394,13 +396,19 @@ class ConvertedDocument(BaseModel):
"caption",
],
) -> str:
- return self.legacy_output.export_to_markdown(
- delim=delim,
- main_text_start=main_text_start,
- main_text_stop=main_text_stop,
- main_text_labels=main_text_labels,
- strict_text=True,
- )
+ if self.legacy_output is None:
+ raise RuntimeError(
+ "No legacy output was produced, can not export as text. "
+ "Please use output.export_to_markdown() instead."
+ )
+ else:
+ return self.legacy_output.export_to_markdown(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ strict_text=True,
+ )
@deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags(
@@ -426,21 +434,27 @@ class ConvertedDocument(BaseModel):
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
- return self.legacy_output.export_to_document_tokens(
- delim=delim,
- main_text_start=main_text_start,
- main_text_stop=main_text_stop,
- main_text_labels=main_text_labels,
- xsize=xsize,
- ysize=ysize,
- add_location=add_location,
- add_content=add_content,
- add_page_index=add_page_index,
- # table specific flags
- add_table_cell_location=add_table_cell_location,
- add_table_cell_label=add_table_cell_label,
- add_table_cell_text=add_table_cell_text,
- )
+ if self.legacy_output is None:
+ raise RuntimeError(
+ "No legacy output was produced, can not export as doctags. "
+ "Please use output.export_to_markdown() instead."
+ )
+ else:
+ return self.legacy_output.export_to_document_tokens(
+ delim=delim,
+ main_text_start=main_text_start,
+ main_text_stop=main_text_stop,
+ main_text_labels=main_text_labels,
+ xsize=xsize,
+ ysize=ysize,
+ add_location=add_location,
+ add_content=add_content,
+ add_page_index=add_page_index,
+ # table specific flags
+ add_table_cell_location=add_table_cell_location,
+ add_table_cell_label=add_table_cell_label,
+ add_table_cell_text=add_table_cell_text,
+ )
def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
diff --git a/docling/document_converter.py b/docling/document_converter.py
index dc919883..44424dba 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -11,7 +11,6 @@ from pydantic import (
ConfigDict,
TypeAdapter,
ValidationError,
- field_validator,
model_validator,
)
from typing_extensions import deprecated
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index 5173c1bf..56202ca9 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,7 +1,6 @@
import logging
from typing import Iterable
-import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py
index ceef4d06..98708a53 100644
--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@@ -1,7 +1,4 @@
import logging
-from typing import Iterable
-
-from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
@@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__)
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
index 0805837b..07b4f171 100644
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -6,13 +6,8 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import (
- PdfPipelineOptions,
- TesseractCliOcrOptions,
- TesseractOcrOptions,
-)
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index f086bae2..cf5cd1cf 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -2,14 +2,11 @@ import json
import logging
from pathlib import Path
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import (
DocumentConverter,
- FormatOption,
PdfFormatOption,
WordFormatOption,
)
@@ -40,6 +37,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
+ InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
diff --git a/tests/data/powerpoint_sample.pptx b/tests/data/powerpoint_sample.pptx
index f54963e2..9779aa6c 100644
Binary files a/tests/data/powerpoint_sample.pptx and b/tests/data/powerpoint_sample.pptx differ
diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py
index d7432a10..8c952123 100644
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@@ -3,7 +3,7 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index ee7f3931..ffc62b43 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
- PipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py
index 80f5ea4e..4d3f96bc 100644
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -6,7 +6,7 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index fb888bf8..23f21103 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -2,7 +2,6 @@ import json
from pathlib import Path
from typing import List
-from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types.experimental import DoclingDocument
from pydantic import TypeAdapter