Change code to use unordered/ordered list, robustifications

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 14:53:38 +02:00
parent 3ee97c42b2
commit 95c1f80087
14 changed files with 111 additions and 78 deletions

View File

@ -1,5 +1,5 @@
import logging
from io import BytesIO, TextIOWrapper
from io import BytesIO
from pathlib import Path
from typing import Set, Union
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
self.analyse_element(element, idx, doc)
except Exception as exc_child:
_log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n")
pass
raise exc_child
except Exception as exc:
pass
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element, idx, doc):
"""Handles list tags (ul, ol) and their list items."""
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
label=GroupLabel.ORDERED_LIST,
)
self.level += 1
self.walk(element, doc)
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
if nested_lists:
name = element.name
text = self.get_direct_text(element)
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
# create a list-item
self.parents[self.level + 1] = doc.add_text(
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
self.level += 1
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str):
text = element.text.strip()
doc.add_text(
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)

View File

@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
try:
self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception:
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list:
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
new_list = doc.add_group(
label=GroupLabel.LIST, name=f"list", parent=parent_slide
)
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = True
else:
e_is_a_list_item = False
if e_is_a_list_item:
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=new_list,
text=e.text.strip(),
prov=prov,

View File

@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)
doc.add_text(
label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level], text=text)
elif (
self.prev_numid() == numid and self.prev_indent() < ilevel
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
)
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
doc.add_text(
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level - 1], text=text)
return
def handle_tables(self, element, docx_obj, doc):

View File

@ -3,7 +3,7 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import Dict, Iterable, List, Optional, Tuple, Union
import filetype
from docling_core.types import BaseText
@ -23,10 +23,6 @@ from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@ -372,14 +368,20 @@ class ConvertedDocument(BaseModel):
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
) -> str:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as markdown. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
@deprecated("Use output.export_to_text() instead.")
def render_as_text(
@ -394,13 +396,19 @@ class ConvertedDocument(BaseModel):
"caption",
],
) -> str:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as text. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)
@deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags(
@ -426,21 +434,27 @@ class ConvertedDocument(BaseModel):
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
return self.legacy_output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as doctags. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)

View File

@ -11,7 +11,6 @@ from pydantic import (
ConfigDict,
TypeAdapter,
ValidationError,
field_validator,
model_validator,
)
from typing_extensions import deprecated

View File

@ -1,7 +1,6 @@
import logging
from typing import Iterable
import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page

View File

@ -1,7 +1,4 @@
import logging
from typing import Iterable
from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__)

View File

@ -6,13 +6,8 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)

View File

@ -2,14 +2,11 @@ import json
import logging
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
@ -40,6 +37,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(

Binary file not shown.

View File

@ -3,7 +3,7 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
PipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)

View File

@ -6,7 +6,7 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -2,7 +2,6 @@ import json
from pathlib import Path
from typing import List
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types.experimental import DoclingDocument
from pydantic import TypeAdapter