mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix: improve HTML layer detection, various MD fixes (#1241)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m31s
Run Docs CI / build-docs (push) Failing after 54s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m31s
Run Docs CI / build-docs (push) Failing after 54s
Markdown fixes:
- properly propagate section header levels
- improve handling of list subroots without text

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -206,9 +206,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
hlevel = int(element.name.replace("h", ""))
|
||||
text = element.text.strip()
|
||||
|
||||
if hlevel == 1:
|
||||
self.content_layer = ContentLayer.BODY
|
||||
self.content_layer = ContentLayer.BODY
|
||||
|
||||
if hlevel == 1:
|
||||
for key in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
|
||||
|
||||
@@ -212,9 +212,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
traverse(element)
|
||||
snippet_text = "".join(strings)
|
||||
if len(snippet_text) > 0:
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
)
|
||||
if doc_label == DocItemLabel.SECTION_HEADER:
|
||||
parent_item = doc.add_heading(
|
||||
text=snippet_text,
|
||||
level=element.level - 1,
|
||||
parent=parent_item,
|
||||
)
|
||||
else:
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
has_non_empty_list_items = False
|
||||
@@ -232,12 +239,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=label, name=f"list", parent=parent_item
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
first_child = element.children[0]
|
||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
||||
is_numbered = False
|
||||
if (
|
||||
|
||||
Reference in New Issue
Block a user