mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Change code to use unordered/ordered list, robustifications
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
3ee97c42b2
commit
95c1f80087
@ -1,5 +1,5 @@
|
||||
import logging
|
||||
from io import BytesIO, TextIOWrapper
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
try:
|
||||
self.analyse_element(element, idx, doc)
|
||||
except Exception as exc_child:
|
||||
|
||||
_log.error(" -> error treating child: ", exc_child)
|
||||
_log.error(" => element: ", element, "\n")
|
||||
pass
|
||||
raise exc_child
|
||||
|
||||
except Exception as exc:
|
||||
pass
|
||||
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def handle_list(self, element, idx, doc):
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
if element.name == "ul":
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
elif element.name == "ol":
|
||||
# create a list group
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
parent=self.parents[self.level],
|
||||
name="ordered list",
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
self.walk(element, doc)
|
||||
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def handle_listitem(self, element, idx, doc):
|
||||
"""Handles listitem tags (li)."""
|
||||
nested_lists = element.find(["ul", "ol"])
|
||||
|
||||
parent_list_label = self.parents[self.level].label
|
||||
index_in_list = len(self.parents[self.level].children) + 1
|
||||
|
||||
if nested_lists:
|
||||
name = element.name
|
||||
text = self.get_direct_text(element)
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||
marker = str(index_in_list)
|
||||
enumerated = True
|
||||
|
||||
# create a list-item
|
||||
self.parents[self.level + 1] = doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
|
||||
self.parents[self.level + 1] = doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif isinstance(element.text, str):
|
||||
text = element.text.strip()
|
||||
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||
marker = f"{str(index_in_list)}."
|
||||
enumerated = True
|
||||
doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
else:
|
||||
_log.warn("list-item has no text: ", element)
|
||||
|
@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
try:
|
||||
self.pptx_obj = Presentation(self.path_or_stream)
|
||||
self.valid = True
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||
) from e
|
||||
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
||||
|
||||
if is_a_list:
|
||||
# TODO: determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
new_list = doc.add_group(
|
||||
label=GroupLabel.LIST, name=f"list", parent=parent_slide
|
||||
)
|
||||
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
e_is_a_list_item = True
|
||||
else:
|
||||
e_is_a_list_item = False
|
||||
|
||||
if e_is_a_list_item:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM,
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
doc.add_list_item(
|
||||
parent=new_list,
|
||||
text=e.text.strip(),
|
||||
prov=prov,
|
||||
|
@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||
)
|
||||
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
|
||||
)
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
doc.add_list_item(parent=self.parents[level], text=text)
|
||||
|
||||
elif (
|
||||
self.prev_numid() == numid and self.prev_indent() < ilevel
|
||||
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level_at_new_list + self.prev_indent() + 1,
|
||||
self.level_at_new_list + ilevel + 1,
|
||||
):
|
||||
# TODO: determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
self.parents[i] = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
||||
)
|
||||
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM,
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
doc.add_list_item(
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
)
|
||||
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM,
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
doc.add_list_item(
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
)
|
||||
|
||||
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
|
||||
)
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
doc.add_list_item(parent=self.parents[level - 1], text=text)
|
||||
return
|
||||
|
||||
def handle_tables(self, element, docx_obj, doc):
|
||||
|
@ -3,7 +3,7 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import filetype
|
||||
from docling_core.types import BaseText
|
||||
@ -23,10 +23,6 @@ from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
@ -372,14 +368,20 @@ class ConvertedDocument(BaseModel):
|
||||
strict_text: bool = False,
|
||||
image_placeholder: str = "<!-- image -->",
|
||||
) -> str:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=strict_text,
|
||||
image_placeholder=image_placeholder,
|
||||
)
|
||||
if self.legacy_output is None:
|
||||
raise RuntimeError(
|
||||
"No legacy output was produced, can not export as markdown. "
|
||||
"Please use output.export_to_markdown() instead."
|
||||
)
|
||||
else:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=strict_text,
|
||||
image_placeholder=image_placeholder,
|
||||
)
|
||||
|
||||
@deprecated("Use output.export_to_text() instead.")
|
||||
def render_as_text(
|
||||
@ -394,13 +396,19 @@ class ConvertedDocument(BaseModel):
|
||||
"caption",
|
||||
],
|
||||
) -> str:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=True,
|
||||
)
|
||||
if self.legacy_output is None:
|
||||
raise RuntimeError(
|
||||
"No legacy output was produced, can not export as text. "
|
||||
"Please use output.export_to_markdown() instead."
|
||||
)
|
||||
else:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=True,
|
||||
)
|
||||
|
||||
@deprecated("Use output.export_to_document_tokens() instead.")
|
||||
def render_as_doctags(
|
||||
@ -426,21 +434,27 @@ class ConvertedDocument(BaseModel):
|
||||
add_table_cell_label: bool = True,
|
||||
add_table_cell_text: bool = True,
|
||||
) -> str:
|
||||
return self.legacy_output.export_to_document_tokens(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
xsize=xsize,
|
||||
ysize=ysize,
|
||||
add_location=add_location,
|
||||
add_content=add_content,
|
||||
add_page_index=add_page_index,
|
||||
# table specific flags
|
||||
add_table_cell_location=add_table_cell_location,
|
||||
add_table_cell_label=add_table_cell_label,
|
||||
add_table_cell_text=add_table_cell_text,
|
||||
)
|
||||
if self.legacy_output is None:
|
||||
raise RuntimeError(
|
||||
"No legacy output was produced, can not export as doctags. "
|
||||
"Please use output.export_to_markdown() instead."
|
||||
)
|
||||
else:
|
||||
return self.legacy_output.export_to_document_tokens(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
xsize=xsize,
|
||||
ysize=ysize,
|
||||
add_location=add_location,
|
||||
add_content=add_content,
|
||||
add_page_index=add_page_index,
|
||||
# table specific flags
|
||||
add_table_cell_location=add_table_cell_location,
|
||||
add_table_cell_label=add_table_cell_label,
|
||||
add_table_cell_text=add_table_cell_text,
|
||||
)
|
||||
|
||||
def render_element_images(
|
||||
self, element_types: Tuple[PageElement] = (FigureElement,)
|
||||
|
@ -11,7 +11,6 @@ from pydantic import (
|
||||
ConfigDict,
|
||||
TypeAdapter,
|
||||
ValidationError,
|
||||
field_validator,
|
||||
model_validator,
|
||||
)
|
||||
from typing_extensions import deprecated
|
||||
|
@ -1,7 +1,6 @@
|
||||
import logging
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
|
@ -1,7 +1,4 @@
|
||||
import logging
|
||||
from typing import Iterable
|
||||
|
||||
from docling_core.types.experimental import NodeItem
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -6,13 +6,8 @@ from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -2,14 +2,11 @@ import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
@ -40,6 +37,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
|
Binary file not shown.
@ -3,7 +3,7 @@ from pathlib import Path
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
|
@ -6,7 +6,7 @@ import pytest
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
@ -2,7 +2,6 @@ import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types.experimental import DoclingDocument
|
||||
from pydantic import TypeAdapter
|
||||
|
Loading…
Reference in New Issue
Block a user