mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction
This commit is contained in:
commit
5b5c99e9da
@ -1,5 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO, TextIOWrapper
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
@ -81,9 +81,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
try:
|
try:
|
||||||
self.analyse_element(element, idx, doc)
|
self.analyse_element(element, idx, doc)
|
||||||
except Exception as exc_child:
|
except Exception as exc_child:
|
||||||
|
|
||||||
_log.error(" -> error treating child: ", exc_child)
|
_log.error(" -> error treating child: ", exc_child)
|
||||||
_log.error(" => element: ", element, "\n")
|
_log.error(" => element: ", element, "\n")
|
||||||
pass
|
raise exc_child
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
pass
|
pass
|
||||||
@ -212,10 +213,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def handle_list(self, element, idx, doc):
|
def handle_list(self, element, idx, doc):
|
||||||
"""Handles list tags (ul, ol) and their list items."""
|
"""Handles list tags (ul, ol) and their list items."""
|
||||||
|
|
||||||
# create a list group
|
if element.name == "ul":
|
||||||
self.parents[self.level + 1] = doc.add_group(
|
# create a list group
|
||||||
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
)
|
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
||||||
|
)
|
||||||
|
elif element.name == "ol":
|
||||||
|
# create a list group
|
||||||
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
name="ordered list",
|
||||||
|
label=GroupLabel.ORDERED_LIST,
|
||||||
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
self.walk(element, doc)
|
self.walk(element, doc)
|
||||||
@ -226,13 +235,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def handle_listitem(self, element, idx, doc):
|
def handle_listitem(self, element, idx, doc):
|
||||||
"""Handles listitem tags (li)."""
|
"""Handles listitem tags (li)."""
|
||||||
nested_lists = element.find(["ul", "ol"])
|
nested_lists = element.find(["ul", "ol"])
|
||||||
|
|
||||||
|
parent_list_label = self.parents[self.level].label
|
||||||
|
index_in_list = len(self.parents[self.level].children) + 1
|
||||||
|
|
||||||
if nested_lists:
|
if nested_lists:
|
||||||
name = element.name
|
name = element.name
|
||||||
text = self.get_direct_text(element)
|
text = self.get_direct_text(element)
|
||||||
|
|
||||||
|
marker = ""
|
||||||
|
enumerated = False
|
||||||
|
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||||
|
marker = str(index_in_list)
|
||||||
|
enumerated = True
|
||||||
|
|
||||||
# create a list-item
|
# create a list-item
|
||||||
self.parents[self.level + 1] = doc.add_text(
|
self.parents[self.level + 1] = doc.add_list_item(
|
||||||
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
|
text=text,
|
||||||
|
enumerated=enumerated,
|
||||||
|
marker=marker,
|
||||||
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -244,8 +266,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elif isinstance(element.text, str):
|
elif isinstance(element.text, str):
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
|
|
||||||
doc.add_text(
|
marker = ""
|
||||||
label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
|
enumerated = False
|
||||||
|
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||||
|
marker = f"{str(index_in_list)}."
|
||||||
|
enumerated = True
|
||||||
|
doc.add_list_item(
|
||||||
|
text=text,
|
||||||
|
enumerated=enumerated,
|
||||||
|
marker=marker,
|
||||||
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_log.warn("list-item has no text: ", element)
|
_log.warn("list-item has no text: ", element)
|
||||||
|
@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
try:
|
try:
|
||||||
self.pptx_obj = Presentation(self.path_or_stream)
|
self.pptx_obj = Presentation(self.path_or_stream)
|
||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||||
) from e
|
) from e
|
||||||
@ -134,6 +134,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
||||||
|
|
||||||
if is_a_list:
|
if is_a_list:
|
||||||
|
# TODO: determine if this is an unordered list or an ordered list.
|
||||||
|
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||||
new_list = doc.add_group(
|
new_list = doc.add_group(
|
||||||
label=GroupLabel.LIST, name=f"list", parent=parent_slide
|
label=GroupLabel.LIST, name=f"list", parent=parent_slide
|
||||||
)
|
)
|
||||||
@ -157,9 +159,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
e_is_a_list_item = True
|
e_is_a_list_item = True
|
||||||
else:
|
else:
|
||||||
e_is_a_list_item = False
|
e_is_a_list_item = False
|
||||||
|
|
||||||
if e_is_a_list_item:
|
if e_is_a_list_item:
|
||||||
doc.add_text(
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
label=DocItemLabel.LIST_ITEM,
|
doc.add_list_item(
|
||||||
parent=new_list,
|
parent=new_list,
|
||||||
text=e.text.strip(),
|
text=e.text.strip(),
|
||||||
prov=prov,
|
prov=prov,
|
||||||
|
@ -312,9 +312,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||||
)
|
)
|
||||||
|
|
||||||
doc.add_text(
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
label=DocItemLabel.LIST_ITEM, parent=self.parents[level], text=text
|
doc.add_list_item(parent=self.parents[level], text=text)
|
||||||
)
|
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self.prev_numid() == numid and self.prev_indent() < ilevel
|
self.prev_numid() == numid and self.prev_indent() < ilevel
|
||||||
@ -323,12 +322,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list + self.prev_indent() + 1,
|
self.level_at_new_list + self.prev_indent() + 1,
|
||||||
self.level_at_new_list + ilevel + 1,
|
self.level_at_new_list + ilevel + 1,
|
||||||
):
|
):
|
||||||
|
# TODO: determine if this is an unordered list or an ordered list.
|
||||||
|
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||||
self.parents[i] = doc.add_group(
|
self.parents[i] = doc.add_group(
|
||||||
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
||||||
)
|
)
|
||||||
|
|
||||||
doc.add_text(
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
label=DocItemLabel.LIST_ITEM,
|
doc.add_list_item(
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
@ -338,16 +339,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if k > self.level_at_new_list + ilevel:
|
if k > self.level_at_new_list + ilevel:
|
||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
|
|
||||||
doc.add_text(
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
label=DocItemLabel.LIST_ITEM,
|
doc.add_list_item(
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
||||||
doc.add_text(
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
|
doc.add_list_item(parent=self.parents[level - 1], text=text)
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_tables(self, element, docx_obj, doc):
|
def handle_tables(self, element, docx_obj, doc):
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
|
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
from docling_core.types import BaseText
|
from docling_core.types import BaseText
|
||||||
@ -24,10 +24,6 @@ from pydantic import BaseModel
|
|||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
@ -372,14 +368,20 @@ class ConversionResult(BaseModel):
|
|||||||
strict_text: bool = False,
|
strict_text: bool = False,
|
||||||
image_placeholder: str = "<!-- image -->",
|
image_placeholder: str = "<!-- image -->",
|
||||||
) -> str:
|
) -> str:
|
||||||
return self.legacy_output.export_to_markdown(
|
if self.legacy_output is None:
|
||||||
delim=delim,
|
raise RuntimeError(
|
||||||
main_text_start=main_text_start,
|
"No legacy output was produced, can not export as markdown. "
|
||||||
main_text_stop=main_text_stop,
|
"Please use output.export_to_markdown() instead."
|
||||||
main_text_labels=main_text_labels,
|
)
|
||||||
strict_text=strict_text,
|
else:
|
||||||
image_placeholder=image_placeholder,
|
return self.legacy_output.export_to_markdown(
|
||||||
)
|
delim=delim,
|
||||||
|
main_text_start=main_text_start,
|
||||||
|
main_text_stop=main_text_stop,
|
||||||
|
main_text_labels=main_text_labels,
|
||||||
|
strict_text=strict_text,
|
||||||
|
image_placeholder=image_placeholder,
|
||||||
|
)
|
||||||
|
|
||||||
@deprecated("Use output.export_to_text() instead.")
|
@deprecated("Use output.export_to_text() instead.")
|
||||||
def render_as_text(
|
def render_as_text(
|
||||||
@ -394,13 +396,19 @@ class ConversionResult(BaseModel):
|
|||||||
"caption",
|
"caption",
|
||||||
],
|
],
|
||||||
) -> str:
|
) -> str:
|
||||||
return self.legacy_output.export_to_markdown(
|
if self.legacy_output is None:
|
||||||
delim=delim,
|
raise RuntimeError(
|
||||||
main_text_start=main_text_start,
|
"No legacy output was produced, can not export as text. "
|
||||||
main_text_stop=main_text_stop,
|
"Please use output.export_to_markdown() instead."
|
||||||
main_text_labels=main_text_labels,
|
)
|
||||||
strict_text=True,
|
else:
|
||||||
)
|
return self.legacy_output.export_to_markdown(
|
||||||
|
delim=delim,
|
||||||
|
main_text_start=main_text_start,
|
||||||
|
main_text_stop=main_text_stop,
|
||||||
|
main_text_labels=main_text_labels,
|
||||||
|
strict_text=True,
|
||||||
|
)
|
||||||
|
|
||||||
@deprecated("Use output.export_to_document_tokens() instead.")
|
@deprecated("Use output.export_to_document_tokens() instead.")
|
||||||
def render_as_doctags(
|
def render_as_doctags(
|
||||||
@ -426,21 +434,27 @@ class ConversionResult(BaseModel):
|
|||||||
add_table_cell_label: bool = True,
|
add_table_cell_label: bool = True,
|
||||||
add_table_cell_text: bool = True,
|
add_table_cell_text: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
return self.legacy_output.export_to_document_tokens(
|
if self.legacy_output is None:
|
||||||
delim=delim,
|
raise RuntimeError(
|
||||||
main_text_start=main_text_start,
|
"No legacy output was produced, can not export as doctags. "
|
||||||
main_text_stop=main_text_stop,
|
"Please use output.export_to_markdown() instead."
|
||||||
main_text_labels=main_text_labels,
|
)
|
||||||
xsize=xsize,
|
else:
|
||||||
ysize=ysize,
|
return self.legacy_output.export_to_document_tokens(
|
||||||
add_location=add_location,
|
delim=delim,
|
||||||
add_content=add_content,
|
main_text_start=main_text_start,
|
||||||
add_page_index=add_page_index,
|
main_text_stop=main_text_stop,
|
||||||
# table specific flags
|
main_text_labels=main_text_labels,
|
||||||
add_table_cell_location=add_table_cell_location,
|
xsize=xsize,
|
||||||
add_table_cell_label=add_table_cell_label,
|
ysize=ysize,
|
||||||
add_table_cell_text=add_table_cell_text,
|
add_location=add_location,
|
||||||
)
|
add_content=add_content,
|
||||||
|
add_page_index=add_page_index,
|
||||||
|
# table specific flags
|
||||||
|
add_table_cell_location=add_table_cell_location,
|
||||||
|
add_table_cell_label=add_table_cell_label,
|
||||||
|
add_table_cell_text=add_table_cell_text,
|
||||||
|
)
|
||||||
|
|
||||||
def render_element_images(
|
def render_element_images(
|
||||||
self, element_types: Tuple[PageElement] = (FigureElement,)
|
self, element_types: Tuple[PageElement] = (FigureElement,)
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
import numpy
|
|
||||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
@ -1,7 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Iterable
|
|
||||||
|
|
||||||
from docling_core.types.experimental import NodeItem
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
AbstractDocumentBackend,
|
AbstractDocumentBackend,
|
||||||
@ -9,7 +6,7 @@ from docling.backend.abstract_backend import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
@ -2,13 +2,10 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import (
|
from docling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
FormatOption,
|
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
WordFormatOption,
|
WordFormatOption,
|
||||||
)
|
)
|
||||||
@ -38,6 +35,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
# InputFormat.IMAGE,
|
# InputFormat.IMAGE,
|
||||||
InputFormat.DOCX,
|
InputFormat.DOCX,
|
||||||
InputFormat.HTML,
|
InputFormat.HTML,
|
||||||
|
InputFormat.PPTX,
|
||||||
], # whitelist formats, other files are ignored.
|
], # whitelist formats, other files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
@ -50,6 +48,30 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
|
pdf=None,
|
||||||
|
docx=WordFormatOption(
|
||||||
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
|
),
|
||||||
|
formats=[
|
||||||
|
InputFormat.PDF,
|
||||||
|
# InputFormat.IMAGE,
|
||||||
|
InputFormat.DOCX,
|
||||||
|
InputFormat.HTML,
|
||||||
|
InputFormat.PPTX,
|
||||||
|
], # whitelist formats, other files are ignored.
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||||
|
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||||
|
InputFormat.DOCX: WordFormatOption(
|
||||||
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
|
),
|
||||||
|
# InputFormat.IMAGE: PdfFormatOption(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_paths)
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
|
Binary file not shown.
@ -3,7 +3,6 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docling_core.types import BaseText
|
|
||||||
from docling_core.types import Document as DsDocument
|
from docling_core.types import Document as DsDocument
|
||||||
from docling_core.types.experimental import DoclingDocument
|
from docling_core.types.experimental import DoclingDocument
|
||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
|
Loading…
Reference in New Issue
Block a user