mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
fix: fix duplicate title and heading + add e2e tests for html and docx (#186)
* add real e2e tests for html and docx
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* updated the output of itxt
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* reformatted the text
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the tests
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the tests (2)
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the examples (1)
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the output of the test
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* updated the tests, moved the ground-truth
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* moved the ground-truth data
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the html tests
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* restructure title fix (#187)
  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
dda2645d4c
commit
f542460af3
@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[self.level] = doc.add_text(
|
||||
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
||||
)
|
||||
else:
|
||||
if hlevel > self.level:
|
||||
|
||||
elif hlevel > self.level:
|
||||
# add invisible group
|
||||
for i in range(self.level + 1, hlevel):
|
||||
self.parents[i] = doc.add_group(
|
||||
name=f"header-{i}",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[i - 1],
|
||||
)
|
||||
self.level = hlevel
|
||||
|
||||
# add invisible group
|
||||
for i in range(self.level + 1, hlevel):
|
||||
self.parents[i] = doc.add_group(
|
||||
name=f"header-{i}",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[i - 1],
|
||||
)
|
||||
self.level = hlevel
|
||||
elif hlevel < self.level:
|
||||
|
||||
elif hlevel < self.level:
|
||||
# remove the tail
|
||||
for key, val in self.parents.items():
|
||||
if key > hlevel:
|
||||
self.parents[key] = None
|
||||
self.level = hlevel
|
||||
|
||||
# remove the tail
|
||||
for key, val in self.parents.items():
|
||||
if key > hlevel:
|
||||
self.parents[key] = None
|
||||
self.level = hlevel
|
||||
|
||||
self.parents[hlevel] = doc.add_heading(
|
||||
parent=self.parents[hlevel - 1],
|
||||
text=text,
|
||||
level=hlevel,
|
||||
)
|
||||
self.parents[hlevel] = doc.add_heading(
|
||||
parent=self.parents[hlevel - 1],
|
||||
text=text,
|
||||
level=hlevel,
|
||||
)
|
||||
|
||||
def handle_paragraph(self, element, idx, doc):
|
||||
"""Handles paragraph tags (p)."""
|
||||
|
||||
Reference in New Issue
Block a user