diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index cae81085..216d156d 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,5 +1,5 @@ import logging -from io import BytesIO, TextIOWrapper +from io import BytesIO from pathlib import Path from typing import Set, Union @@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: self.analyse_element(element, idx, doc) except Exception as exc_child: + _log.error(" -> error treating child: ", exc_child) _log.error(" => element: ", element, "\n") - pass + raise exc_child except Exception as exc: pass @@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_list(self, element, idx, doc): """Handles list tags (ul, ol) and their list items.""" - # create a list group - self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], name="list", label=GroupLabel.LIST - ) + if element.name == "ul": + # create a list group + self.parents[self.level + 1] = doc.add_group( + parent=self.parents[self.level], name="list", label=GroupLabel.LIST + ) + elif element.name == "ol": + # create a list group + self.parents[self.level + 1] = doc.add_group( + parent=self.parents[self.level], + name="ordered list", + label=GroupLabel.ORDERED_LIST, + ) self.level += 1 self.walk(element, doc) @@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_listitem(self, element, idx, doc): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) + + parent_list_label = self.parents[self.level].label + index_in_list = len(self.parents[self.level].children) + 1 + if nested_lists: name = element.name text = self.get_direct_text(element) + marker = "" + enumerated = False + if parent_list_label == GroupLabel.ORDERED_LIST: + marker = str(index_in_list) + enumerated = True + # create a list-item - self.parents[self.level + 1] = doc.add_text( - label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] + self.parents[self.level + 1] = doc.add_list_item( + text=text, + enumerated=enumerated, + marker=marker, + parent=self.parents[self.level], ) self.level += 1 @@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element.text, str): text = element.text.strip() - doc.add_text( - label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] + marker = "" + enumerated = False + if parent_list_label == GroupLabel.ORDERED_LIST: + marker = f"{str(index_in_list)}." + enumerated = True + doc.add_list_item( + text=text, + enumerated=enumerated, + marker=marker, + parent=self.parents[self.level], ) else: _log.warn("list-item has no text: ", element) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 7703e3b1..b67c3ca3 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB try: self.pptx_obj = Presentation(self.path_or_stream) self.valid = True - except Exception: + except Exception as e: raise RuntimeError( f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" ) from e @@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB prov = self.generate_prov(shape, slide_ind, shape.text.strip()) if is_a_list: + # TODO: determine if this is an unordered list or an ordered list. + # Set GroupLabel.ORDERED_LIST when it fits. new_list = doc.add_group( label=GroupLabel.LIST, name=f"list", parent=parent_slide ) @@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB e_is_a_list_item = True else: e_is_a_list_item = False + if e_is_a_list_item: - doc.add_text( - label=DocItemLabel.LIST_ITEM, + # TODO: Set marker and enumerated arguments if this is an enumeration element. + doc.add_list_item( parent=new_list, text=e.text.strip(), prov=prov, diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 49911009..c3504b33 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] ) - doc.add_text( - label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text - ) + # TODO: Set marker and enumerated arguments if this is an enumeration element. + doc.add_list_item(parent=self.parents[level], text=text) elif ( self.prev_numid() == numid and self.prev_indent() < ilevel @@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list + self.prev_indent() + 1, self.level_at_new_list + ilevel + 1, ): + # TODO: determine if this is an unordered list or an ordered list. + # Set GroupLabel.ORDERED_LIST when it fits. self.parents[i] = doc.add_group( label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] ) - doc.add_text( - label=DocItemLabel.LIST_ITEM, + # TODO: Set marker and enumerated arguments if this is an enumeration element. + doc.add_list_item( parent=self.parents[self.level_at_new_list + ilevel], text=text, ) @@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if k > self.level_at_new_list + ilevel: self.parents[k] = None - doc.add_text( - label=DocItemLabel.LIST_ITEM, + # TODO: Set marker and enumerated arguments if this is an enumeration element. + doc.add_list_item( parent=self.parents[self.level_at_new_list + ilevel], text=text, ) elif self.prev_numid() == numid or self.prev_indent() == ilevel: - doc.add_text( - label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text - ) + # TODO: Set marker and enumerated arguments if this is an enumeration element. + doc.add_list_item(parent=self.parents[level - 1], text=text) return def handle_tables(self, element, docx_obj, doc): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 615acfac..9240c6b5 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -3,7 +3,7 @@ import re from enum import Enum from io import BytesIO from pathlib import Path, PurePath -from typing import Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union import filetype from docling_core.types import BaseText @@ -24,10 +24,6 @@ from pydantic import BaseModel from typing_extensions import deprecated from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.html_backend import HTMLDocumentBackend -from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend -from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import ( AssembledUnit, ConversionStatus, @@ -372,14 +368,20 @@ class ConversionResult(BaseModel): strict_text: bool = False, image_placeholder: str = "", ) -> str: - return self.legacy_output.export_to_markdown( - delim=delim, - main_text_start=main_text_start, - main_text_stop=main_text_stop, - main_text_labels=main_text_labels, - strict_text=strict_text, - image_placeholder=image_placeholder, - ) + if self.legacy_output is None: + raise RuntimeError( + "No legacy output was produced, can not export as markdown. " + "Please use output.export_to_markdown() instead." + ) + else: + return self.legacy_output.export_to_markdown( + delim=delim, + main_text_start=main_text_start, + main_text_stop=main_text_stop, + main_text_labels=main_text_labels, + strict_text=strict_text, + image_placeholder=image_placeholder, + ) @deprecated("Use output.export_to_text() instead.") def render_as_text( @@ -394,13 +396,19 @@ class ConversionResult(BaseModel): "caption", ], ) -> str: - return self.legacy_output.export_to_markdown( - delim=delim, - main_text_start=main_text_start, - main_text_stop=main_text_stop, - main_text_labels=main_text_labels, - strict_text=True, - ) + if self.legacy_output is None: + raise RuntimeError( + "No legacy output was produced, can not export as text. " + "Please use output.export_to_markdown() instead." + ) + else: + return self.legacy_output.export_to_markdown( + delim=delim, + main_text_start=main_text_start, + main_text_stop=main_text_stop, + main_text_labels=main_text_labels, + strict_text=True, + ) @deprecated("Use output.export_to_document_tokens() instead.") def render_as_doctags( @@ -426,21 +434,27 @@ class ConversionResult(BaseModel): add_table_cell_label: bool = True, add_table_cell_text: bool = True, ) -> str: - return self.legacy_output.export_to_document_tokens( - delim=delim, - main_text_start=main_text_start, - main_text_stop=main_text_stop, - main_text_labels=main_text_labels, - xsize=xsize, - ysize=ysize, - add_location=add_location, - add_content=add_content, - add_page_index=add_page_index, - # table specific flags - add_table_cell_location=add_table_cell_location, - add_table_cell_label=add_table_cell_label, - add_table_cell_text=add_table_cell_text, - ) + if self.legacy_output is None: + raise RuntimeError( + "No legacy output was produced, can not export as doctags. " + "Please use output.export_to_markdown() instead." + ) + else: + return self.legacy_output.export_to_document_tokens( + delim=delim, + main_text_start=main_text_start, + main_text_stop=main_text_stop, + main_text_labels=main_text_labels, + xsize=xsize, + ysize=ysize, + add_location=add_location, + add_content=add_content, + add_page_index=add_page_index, + # table specific flags + add_table_cell_location=add_table_cell_location, + add_table_cell_label=add_table_cell_label, + add_table_cell_text=add_table_cell_text, + ) def render_element_images( self, element_types: Tuple[PageElement] = (FigureElement,) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 5173c1bf..56202ca9 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,7 +1,6 @@ import logging from typing import Iterable -import numpy from docling_core.types.experimental import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py index 2e5aa8fa..ee5febab 100644 --- a/docling/pipeline/simple_model_pipeline.py +++ b/docling/pipeline/simple_model_pipeline.py @@ -1,7 +1,4 @@ import logging -from typing import Iterable - -from docling_core.types.experimental import NodeItem from docling.backend.abstract_backend import ( AbstractDocumentBackend, @@ -9,7 +6,7 @@ from docling.backend.abstract_backend import ( ) from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import PipelineOptions from docling.pipeline.base_model_pipeline import AbstractModelPipeline _log = logging.getLogger(__name__) diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index 37bb1b1a..f93db241 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -2,13 +2,10 @@ import json import logging from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat from docling.document_converter import ( DocumentConverter, - FormatOption, PdfFormatOption, WordFormatOption, ) @@ -38,6 +35,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal # InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML, + InputFormat.PPTX, ], # whitelist formats, other files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( @@ -50,6 +48,30 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal }, ) +doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. + pdf=None, + docx=WordFormatOption( + pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + ), + formats=[ + InputFormat.PDF, + # InputFormat.IMAGE, + InputFormat.DOCX, + InputFormat.HTML, + InputFormat.PPTX, + ], # whitelist formats, other files are ignored. + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend + ), # PdfFormatOption(backend=PyPdfiumDocumentBackend), + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + ), + # InputFormat.IMAGE: PdfFormatOption(), + }, +) + + conv_results = doc_converter.convert_all(input_paths) for res in conv_results: diff --git a/tests/data/powerpoint_sample.pptx b/tests/data/powerpoint_sample.pptx index f54963e2..9779aa6c 100644 Binary files a/tests/data/powerpoint_sample.pptx and b/tests/data/powerpoint_sample.pptx differ diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 186f2d9b..44e51829 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -3,7 +3,6 @@ import warnings from pathlib import Path from typing import List -from docling_core.types import BaseText from docling_core.types import Document as DsDocument from docling_core.types.experimental import DoclingDocument from pydantic import TypeAdapter