mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
add change and updated test data
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
39401f5157
commit
9368329973
@ -2,9 +2,10 @@ import logging
|
|||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Set, Union
|
from typing import List, Literal, Optional, Set, Union
|
||||||
|
|
||||||
import marko
|
import marko
|
||||||
import marko.element
|
import marko.element
|
||||||
@ -21,7 +22,8 @@ from docling_core.types.doc import (
|
|||||||
)
|
)
|
||||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
from pydantic import AnyUrl, TypeAdapter
|
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
||||||
|
from typing_extensions import Annotated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|||||||
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||||
|
|
||||||
|
|
||||||
|
class _PendingCreationType(str, Enum):
|
||||||
|
"""Types of items pending creation."""
|
||||||
|
|
||||||
|
HEADING = "heading"
|
||||||
|
LIST_ITEM = "list_item"
|
||||||
|
|
||||||
|
|
||||||
|
# Payload describing a heading that still has to be created; `kind` is the
# fixed tag "heading" and `level` carries the heading level to create.
class _HeadingCreationPayload(BaseModel):
|
||||||
|
kind: Literal["heading"] = "heading"
|
||||||
|
level: int
|
||||||
|
|
||||||
|
|
||||||
|
# Payload describing a list item that still has to be created; it carries no
# data beyond the fixed `kind` tag "list_item".
class _ListItemCreationPayload(BaseModel):
|
||||||
|
kind: Literal["list_item"] = "list_item"
|
||||||
|
|
||||||
|
|
||||||
|
# Union of the creation payload models above, discriminated on their `kind`
# field.
_CreationPayload = Annotated[
|
||||||
|
Union[
|
||||||
|
_HeadingCreationPayload,
|
||||||
|
_ListItemCreationPayload,
|
||||||
|
],
|
||||||
|
Field(discriminator="kind"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||||
# This regex will match any sequence of underscores
|
# This regex will match any sequence of underscores
|
||||||
@ -155,6 +182,52 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_table(data=table_data)
|
doc.add_table(data=table_data)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Add a list item with the given text, formatting and hyperlink to `doc`
# under `parent_item`, and return the item produced by `doc.add_list_item`.
def _create_list_item(
|
||||||
|
self,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
parent_item: Optional[NodeItem],
|
||||||
|
text: str,
|
||||||
|
formatting: Optional[Formatting] = None,
|
||||||
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||||
|
):
|
||||||
|
# A parent that is neither an OrderedList nor an UnorderedList is wrapped:
# a new unordered list is added under it and becomes the item's parent.
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
||||||
|
_log.warning("ListItem would have not had a list parent, adding one.")
|
||||||
|
parent_item = doc.add_unordered_list(parent=parent_item)
|
||||||
|
item = doc.add_list_item(
|
||||||
|
text=text,
|
||||||
|
# Enumerated only when the (possibly replaced) parent is an OrderedList.
enumerated=(isinstance(parent_item, OrderedList)),
|
||||||
|
parent=parent_item,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
return item
|
||||||
|
|
||||||
|
# Add a heading with the given text, formatting and hyperlink to `doc` under
# `parent_item`, and return the created item. `level` is the level passed in
# by the caller (the heading level of the marko element at the call sites).
def _create_heading_item(
|
||||||
|
self,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
parent_item: Optional[NodeItem],
|
||||||
|
text: str,
|
||||||
|
level: int,
|
||||||
|
formatting: Optional[Formatting] = None,
|
||||||
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||||
|
):
|
||||||
|
# Level 1 is stored as the document title rather than as a heading.
if level == 1:
|
||||||
|
item = doc.add_title(
|
||||||
|
text=text,
|
||||||
|
parent=parent_item,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
item = doc.add_heading(
|
||||||
|
text=text,
|
||||||
|
# Shifted down by one because level 1 is taken by the title branch above.
level=level - 1,
|
||||||
|
parent=parent_item,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
return item
|
||||||
|
|
||||||
def _iterate_elements( # noqa: C901
|
def _iterate_elements( # noqa: C901
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -162,6 +235,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
depth: int,
|
depth: int,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
visited: Set[marko.element.Element],
|
visited: Set[marko.element.Element],
|
||||||
|
creation_stack: list[
|
||||||
|
_CreationPayload
|
||||||
|
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
||||||
parent_item: Optional[NodeItem] = None,
|
parent_item: Optional[NodeItem] = None,
|
||||||
formatting: Optional[Formatting] = None,
|
formatting: Optional[Formatting] = None,
|
||||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||||
@ -177,28 +253,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(element.children) == 1:
|
if len(element.children) > 1: # inline group will be created further down
|
||||||
child = element.children[0]
|
parent_item = self._create_heading_item(
|
||||||
snippet_text = str(child.children) # type: ignore
|
doc=doc,
|
||||||
visited.add(child)
|
parent_item=parent_item,
|
||||||
else:
|
text="",
|
||||||
snippet_text = "" # inline group will be created
|
level=element.level,
|
||||||
|
|
||||||
if element.level == 1:
|
|
||||||
parent_item = doc.add_title(
|
|
||||||
text=snippet_text,
|
|
||||||
parent=parent_item,
|
|
||||||
formatting=formatting,
|
formatting=formatting,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
parent_item = doc.add_heading(
|
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
||||||
text=snippet_text,
|
|
||||||
level=element.level - 1,
|
|
||||||
parent=parent_item,
|
|
||||||
formatting=formatting,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
has_non_empty_list_items = False
|
has_non_empty_list_items = False
|
||||||
@ -224,22 +289,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
_log.debug(" - List item")
|
_log.debug(" - List item")
|
||||||
|
|
||||||
if len(child.children) == 1:
|
if len(child.children) > 1: # inline group will be created further down
|
||||||
snippet_text = str(child.children[0].children) # type: ignore
|
parent_item = self._create_list_item(
|
||||||
visited.add(child)
|
doc=doc,
|
||||||
|
parent_item=parent_item,
|
||||||
|
text="",
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
snippet_text = "" # inline group will be created
|
creation_stack.append(_ListItemCreationPayload())
|
||||||
is_numbered = isinstance(parent_item, OrderedList)
|
|
||||||
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
|
||||||
_log.warning("ListItem would have not had a list parent, adding one.")
|
|
||||||
parent_item = doc.add_unordered_list(parent=parent_item)
|
|
||||||
parent_item = doc.add_list_item(
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=parent_item,
|
|
||||||
text=snippet_text,
|
|
||||||
formatting=formatting,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
@ -285,13 +344,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.md_table_buffer.append(snippet_text)
|
self.md_table_buffer.append(snippet_text)
|
||||||
elif snippet_text:
|
elif snippet_text:
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.TEXT,
|
if creation_stack:
|
||||||
parent=parent_item,
|
while len(creation_stack) > 0:
|
||||||
text=snippet_text,
|
to_create = creation_stack.pop()
|
||||||
formatting=formatting,
|
if isinstance(to_create, _ListItemCreationPayload):
|
||||||
hyperlink=hyperlink,
|
parent_item = self._create_list_item(
|
||||||
)
|
doc=doc,
|
||||||
|
parent_item=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
elif isinstance(to_create, _HeadingCreationPayload):
|
||||||
|
# not keeping as parent_item as logic for correctly tracking
|
||||||
|
# that is not implemented yet (section components not captured
|
||||||
|
# as heading children in marko)
|
||||||
|
self._create_heading_item(
|
||||||
|
doc=doc,
|
||||||
|
parent_item=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
level=to_create.level,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
@ -353,7 +437,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent_item = doc.add_inline_group(parent=parent_item)
|
parent_item = doc.add_inline_group(parent=parent_item)
|
||||||
|
|
||||||
processed_block_types = (
|
processed_block_types = (
|
||||||
# marko.block.Heading,
|
|
||||||
marko.block.CodeBlock,
|
marko.block.CodeBlock,
|
||||||
marko.block.FencedCode,
|
marko.block.FencedCode,
|
||||||
marko.inline.RawText,
|
marko.inline.RawText,
|
||||||
@ -369,6 +452,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
depth=depth + 1,
|
depth=depth + 1,
|
||||||
doc=doc,
|
doc=doc,
|
||||||
visited=visited,
|
visited=visited,
|
||||||
|
creation_stack=creation_stack,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
formatting=formatting,
|
formatting=formatting,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
@ -405,6 +489,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Parse the markdown into an abstract syntax tree (AST)
|
# Parse the markdown into an abstract syntax tree (AST)
|
||||||
marko_parser = Markdown()
|
marko_parser = Markdown()
|
||||||
parsed_ast = marko_parser.parse(self.markdown)
|
parsed_ast = marko_parser.parse(self.markdown)
|
||||||
|
print(f"{parsed_ast=}")
|
||||||
# Start iterating from the root of the AST
|
# Start iterating from the root of the AST
|
||||||
self._iterate_elements(
|
self._iterate_elements(
|
||||||
element=parsed_ast,
|
element=parsed_ast,
|
||||||
@ -412,6 +497,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc=doc,
|
doc=doc,
|
||||||
parent_item=None,
|
parent_item=None,
|
||||||
visited=set(),
|
visited=set(),
|
||||||
|
creation_stack=[],
|
||||||
)
|
)
|
||||||
self._close_table(doc=doc) # handle any last hanging table
|
self._close_table(doc=doc) # handle any last hanging table
|
||||||
|
|
||||||
|
@ -11,10 +11,10 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
|
|||||||
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
|
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
|
||||||
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
||||||
5. Open a Pull Request
|
5. Open a Pull Request
|
||||||
6. [<RawText children='Whole list item has same formatting'>]
|
6. **Whole list item has same formatting**
|
||||||
7. List item has *mixed or partial* formatting
|
7. List item has *mixed or partial* formatting
|
||||||
|
|
||||||
# [<RawText children='Whole heading is italic'>]
|
*# Whole heading is italic*
|
||||||
|
|
||||||
Bar
|
Bar
|
||||||
|
|
||||||
|
@ -424,14 +424,19 @@ texts:
|
|||||||
- children: []
|
- children: []
|
||||||
content_layer: body
|
content_layer: body
|
||||||
enumerated: true
|
enumerated: true
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: false
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
label: list_item
|
label: list_item
|
||||||
marker: '-'
|
marker: '-'
|
||||||
orig: '[<RawText children=''Whole list item has same formatting''>]'
|
orig: Whole list item has same formatting
|
||||||
parent:
|
parent:
|
||||||
$ref: '#/groups/2'
|
$ref: '#/groups/2'
|
||||||
prov: []
|
prov: []
|
||||||
self_ref: '#/texts/27'
|
self_ref: '#/texts/27'
|
||||||
text: '[<RawText children=''Whole list item has same formatting''>]'
|
text: Whole list item has same formatting
|
||||||
- children:
|
- children:
|
||||||
- $ref: '#/groups/7'
|
- $ref: '#/groups/7'
|
||||||
content_layer: body
|
content_layer: body
|
||||||
@ -478,13 +483,18 @@ texts:
|
|||||||
text: formatting
|
text: formatting
|
||||||
- children: []
|
- children: []
|
||||||
content_layer: body
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: false
|
||||||
|
italic: true
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
label: title
|
label: title
|
||||||
orig: '[<RawText children=''Whole heading is italic''>]'
|
orig: Whole heading is italic
|
||||||
parent:
|
parent:
|
||||||
$ref: '#/body'
|
$ref: '#/body'
|
||||||
prov: []
|
prov: []
|
||||||
self_ref: '#/texts/32'
|
self_ref: '#/texts/32'
|
||||||
text: '[<RawText children=''Whole heading is italic''>]'
|
text: Whole heading is italic
|
||||||
- children: []
|
- children: []
|
||||||
content_layer: body
|
content_layer: body
|
||||||
label: text
|
label: text
|
||||||
|
Loading…
Reference in New Issue
Block a user