Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Michele Dolfi 2024-10-11 16:31:28 +02:00
commit 5b5c99e9da
9 changed files with 131 additions and 67 deletions

View File

@ -1,5 +1,5 @@
import logging import logging
from io import BytesIO, TextIOWrapper from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try: try:
self.analyse_element(element, idx, doc) self.analyse_element(element, idx, doc)
except Exception as exc_child: except Exception as exc_child:
_log.error(" -> error treating child: ", exc_child) _log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n") _log.error(" => element: ", element, "\n")
pass raise exc_child
except Exception as exc: except Exception as exc:
pass pass
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element, idx, doc): def handle_list(self, element, idx, doc):
"""Handles list tags (ul, ol) and their list items.""" """Handles list tags (ul, ol) and their list items."""
# create a list group if element.name == "ul":
self.parents[self.level + 1] = doc.add_group( # create a list group
parent=self.parents[self.level], name="list", label=GroupLabel.LIST self.parents[self.level + 1] = doc.add_group(
) parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
label=GroupLabel.ORDERED_LIST,
)
self.level += 1 self.level += 1
self.walk(element, doc) self.walk(element, doc)
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_listitem(self, element, idx, doc): def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li).""" """Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"]) nested_lists = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
if nested_lists: if nested_lists:
name = element.name name = element.name
text = self.get_direct_text(element) text = self.get_direct_text(element)
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
# create a list-item # create a list-item
self.parents[self.level + 1] = doc.add_text( self.parents[self.level + 1] = doc.add_list_item(
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str): elif isinstance(element.text, str):
text = element.text.strip() text = element.text.strip()
doc.add_text( marker = ""
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
) )
else: else:
_log.warn("list-item has no text: ", element) _log.warn("list-item has no text: ", element)

View File

@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
try: try:
self.pptx_obj = Presentation(self.path_or_stream) self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True self.valid = True
except Exception: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e ) from e
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
prov = self.generate_prov(shape, slide_ind, shape.text.strip()) prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list: if is_a_list:
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
new_list = doc.add_group( new_list = doc.add_group(
label=GroupLabel.LIST, name=f"list", parent=parent_slide label=GroupLabel.LIST, name=f"list", parent=parent_slide
) )
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = True e_is_a_list_item = True
else: else:
e_is_a_list_item = False e_is_a_list_item = False
if e_is_a_list_item: if e_is_a_list_item:
doc.add_text( # TODO: Set marker and enumerated arguments if this is an enumeration element.
label=DocItemLabel.LIST_ITEM, doc.add_list_item(
parent=new_list, parent=new_list,
text=e.text.strip(), text=e.text.strip(),
prov=prov, prov=prov,

View File

@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
) )
doc.add_text( # TODO: Set marker and enumerated arguments if this is an enumeration element.
label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text doc.add_list_item(parent=self.parents[level], text=text)
)
elif ( elif (
self.prev_numid() == numid and self.prev_indent() < ilevel self.prev_numid() == numid and self.prev_indent() < ilevel
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + self.prev_indent() + 1, self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1, self.level_at_new_list + ilevel + 1,
): ):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.parents[i] = doc.add_group( self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
) )
doc.add_text( # TODO: Set marker and enumerated arguments if this is an enumeration element.
label=DocItemLabel.LIST_ITEM, doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
text=text, text=text,
) )
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if k > self.level_at_new_list + ilevel: if k > self.level_at_new_list + ilevel:
self.parents[k] = None self.parents[k] = None
doc.add_text( # TODO: Set marker and enumerated arguments if this is an enumeration element.
label=DocItemLabel.LIST_ITEM, doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
text=text, text=text,
) )
elif self.prev_numid() == numid or self.prev_indent() == ilevel: elif self.prev_numid() == numid or self.prev_indent() == ilevel:
doc.add_text( # TODO: Set marker and enumerated arguments if this is an enumeration element.
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text doc.add_list_item(parent=self.parents[level - 1], text=text)
)
return return
def handle_tables(self, element, docx_obj, doc): def handle_tables(self, element, docx_obj, doc):

View File

@ -3,7 +3,7 @@ import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union from typing import Dict, Iterable, List, Optional, Tuple, Union
import filetype import filetype
from docling_core.types import BaseText from docling_core.types import BaseText
@ -24,10 +24,6 @@ from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
@ -372,14 +368,20 @@ class ConversionResult(BaseModel):
strict_text: bool = False, strict_text: bool = False,
image_placeholder: str = "<!-- image -->", image_placeholder: str = "<!-- image -->",
) -> str: ) -> str:
return self.legacy_output.export_to_markdown( if self.legacy_output is None:
delim=delim, raise RuntimeError(
main_text_start=main_text_start, "No legacy output was produced, can not export as markdown. "
main_text_stop=main_text_stop, "Please use output.export_to_markdown() instead."
main_text_labels=main_text_labels, )
strict_text=strict_text, else:
image_placeholder=image_placeholder, return self.legacy_output.export_to_markdown(
) delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
@deprecated("Use output.export_to_text() instead.") @deprecated("Use output.export_to_text() instead.")
def render_as_text( def render_as_text(
@ -394,13 +396,19 @@ class ConversionResult(BaseModel):
"caption", "caption",
], ],
) -> str: ) -> str:
return self.legacy_output.export_to_markdown( if self.legacy_output is None:
delim=delim, raise RuntimeError(
main_text_start=main_text_start, "No legacy output was produced, can not export as text. "
main_text_stop=main_text_stop, "Please use output.export_to_markdown() instead."
main_text_labels=main_text_labels, )
strict_text=True, else:
) return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)
@deprecated("Use output.export_to_document_tokens() instead.") @deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags( def render_as_doctags(
@ -426,21 +434,27 @@ class ConversionResult(BaseModel):
add_table_cell_label: bool = True, add_table_cell_label: bool = True,
add_table_cell_text: bool = True, add_table_cell_text: bool = True,
) -> str: ) -> str:
return self.legacy_output.export_to_document_tokens( if self.legacy_output is None:
delim=delim, raise RuntimeError(
main_text_start=main_text_start, "No legacy output was produced, can not export as doctags. "
main_text_stop=main_text_stop, "Please use output.export_to_markdown() instead."
main_text_labels=main_text_labels, )
xsize=xsize, else:
ysize=ysize, return self.legacy_output.export_to_document_tokens(
add_location=add_location, delim=delim,
add_content=add_content, main_text_start=main_text_start,
add_page_index=add_page_index, main_text_stop=main_text_stop,
# table specific flags main_text_labels=main_text_labels,
add_table_cell_location=add_table_cell_location, xsize=xsize,
add_table_cell_label=add_table_cell_label, ysize=ysize,
add_table_cell_text=add_table_cell_text, add_location=add_location,
) add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
def render_element_images( def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,) self, element_types: Tuple[PageElement] = (FigureElement,)

View File

@ -1,7 +1,6 @@
import logging import logging
from typing import Iterable from typing import Iterable
import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import OcrCell, Page

View File

@ -1,7 +1,4 @@
import logging import logging
from typing import Iterable
from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import (
AbstractDocumentBackend, AbstractDocumentBackend,
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
) )
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -2,13 +2,10 @@ import json
import logging import logging
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.document_converter import ( from docling.document_converter import (
DocumentConverter, DocumentConverter,
FormatOption,
PdfFormatOption, PdfFormatOption,
WordFormatOption, WordFormatOption,
) )
@ -38,6 +35,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
# InputFormat.IMAGE, # InputFormat.IMAGE,
InputFormat.DOCX, InputFormat.DOCX,
InputFormat.HTML, InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
@ -50,6 +48,30 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
}, },
) )
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
pdf=None,
docx=WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
formats=[
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert_all(input_paths) conv_results = doc_converter.convert_all(input_paths)
for res in conv_results: for res in conv_results:

Binary file not shown.

View File

@ -3,7 +3,6 @@ import warnings
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument from docling_core.types import Document as DsDocument
from docling_core.types.experimental import DoclingDocument from docling_core.types.experimental import DoclingDocument
from pydantic import TypeAdapter from pydantic import TypeAdapter