Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Michele Dolfi 2024-10-11 16:31:28 +02:00
commit 5b5c99e9da
9 changed files with 131 additions and 67 deletions

View File

@ -1,5 +1,5 @@
import logging
from io import BytesIO, TextIOWrapper
from io import BytesIO
from pathlib import Path
from typing import Set, Union
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
self.analyse_element(element, idx, doc)
except Exception as exc_child:
_log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n")
pass
raise exc_child
except Exception as exc:
pass
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element, idx, doc):
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
label=GroupLabel.ORDERED_LIST,
)
self.level += 1
self.walk(element, doc)
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
if nested_lists:
name = element.name
text = self.get_direct_text(element)
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
# create a list-item
self.parents[self.level + 1] = doc.add_text(
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
self.level += 1
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element.text, str):
text = element.text.strip()
doc.add_text(
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)

View File

@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
try:
self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception:
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list:
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
new_list = doc.add_group(
label=GroupLabel.LIST, name=f"list", parent=parent_slide
)
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = True
else:
e_is_a_list_item = False
if e_is_a_list_item:
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=new_list,
text=e.text.strip(),
prov=prov,

View File

@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)
doc.add_text(
label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level], text=text)
elif (
self.prev_numid() == numid and self.prev_indent() < ilevel
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
)
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
doc.add_text(
label=DocItemLabel.LIST_ITEM,
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
doc.add_text(
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level - 1], text=text)
return
def handle_tables(self, element, docx_obj, doc):

View File

@ -3,7 +3,7 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import Dict, Iterable, List, Optional, Tuple, Union
import filetype
from docling_core.types import BaseText
@ -24,10 +24,6 @@ from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@ -372,6 +368,12 @@ class ConversionResult(BaseModel):
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as markdown. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
@ -394,6 +396,12 @@ class ConversionResult(BaseModel):
"caption",
],
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as text. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
@ -426,6 +434,12 @@ class ConversionResult(BaseModel):
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as doctags. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,

View File

@ -1,7 +1,6 @@
import logging
from typing import Iterable
import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page

View File

@ -1,7 +1,4 @@
import logging
from typing import Iterable
from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__)

View File

@ -2,13 +2,10 @@ import json
import logging
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
@ -38,6 +35,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
@ -50,6 +48,30 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
},
)
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
pdf=None,
docx=WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
formats=[
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:

Binary file not shown.

View File

@ -3,7 +3,6 @@ import warnings
from pathlib import Path
from typing import List
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types.experimental import DoclingDocument
from pydantic import TypeAdapter