mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
reformatted the code
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
5c82ff9890
commit
311640fb9d
@ -11,6 +11,8 @@ from docling_core.types.doc import (
|
|||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
|
ImageRef,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
@ -117,16 +119,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
_log.debug(f"ignoring element of type {type(element)}")
|
_log.debug(f"ignoring element of type {type(element)}")
|
||||||
|
|
||||||
"""
|
|
||||||
elif isinstance(element, Tag):
|
|
||||||
try:
|
|
||||||
self.analyse_element(element, 0, doc)
|
|
||||||
except Exception as exc:
|
|
||||||
_log.info(f" -> error treating elem: {exc}")
|
|
||||||
raise exc
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_log.debug(f"error walking element: {type(element)}")
|
_log.debug(f"error walking element: {type(element)}")
|
||||||
pass
|
pass
|
||||||
@ -472,17 +464,63 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
return cell.text
|
return cell.text
|
||||||
|
|
||||||
def handle_figure(self, element, idx, doc):
|
def _get_imageref(self, element):
|
||||||
"""Handles image tags (img)."""
|
|
||||||
|
|
||||||
# Extract the image URI from the <img> tag
|
fig_ref = None
|
||||||
# image_uri = root.xpath('//figure//img/@src')[0]
|
|
||||||
|
img = element.find(["img"])
|
||||||
|
_log.info(img)
|
||||||
|
|
||||||
|
if img is not None and img.has_attr("src"):
|
||||||
|
fig_uri = img["src"]
|
||||||
|
_log.info(fig_uri)
|
||||||
|
|
||||||
|
dpi = 128
|
||||||
|
try:
|
||||||
|
dpi = int(img["dpi"])
|
||||||
|
except:
|
||||||
|
_log.debug("could not identify `dpi` of image")
|
||||||
|
|
||||||
|
width = 128
|
||||||
|
try:
|
||||||
|
width = int(img["width"])
|
||||||
|
except:
|
||||||
|
_log.debug("could not identify `width` of image")
|
||||||
|
|
||||||
|
height = 128
|
||||||
|
try:
|
||||||
|
height = int(img["height"])
|
||||||
|
except:
|
||||||
|
_log.debug("could not identify `height` of image")
|
||||||
|
|
||||||
|
if fig_uri.endswith(".jpg"):
|
||||||
|
fig_ref = ImageRef(
|
||||||
|
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
|
||||||
|
)
|
||||||
|
|
||||||
|
elif fig_uri.endswith(".jpeg"):
|
||||||
|
fig_ref = ImageRef(
|
||||||
|
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
|
||||||
|
)
|
||||||
|
|
||||||
|
elif fig_uri.endswith(".png"):
|
||||||
|
fig_ref = ImageRef(
|
||||||
|
mimetype="image/png", dpi=dpi, size=Size(width, height), uri=fig_uri
|
||||||
|
)
|
||||||
|
|
||||||
|
elif fig_uri.endswith(".svg"):
|
||||||
|
fig_ref = ImageRef(
|
||||||
|
mimetype="image/svg", dpi=dpi, size=Size(width, height), uri=fig_uri
|
||||||
|
)
|
||||||
|
|
||||||
|
return fig_ref
|
||||||
|
|
||||||
|
def _get_figcaption(self, element, doc):
|
||||||
|
|
||||||
|
fig_caption = None
|
||||||
|
|
||||||
contains_captions = element.find(["figcaption"])
|
contains_captions = element.find(["figcaption"])
|
||||||
if contains_captions is None:
|
if contains_captions is not None:
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
|
||||||
|
|
||||||
else:
|
|
||||||
texts = []
|
texts = []
|
||||||
for item in contains_captions:
|
for item in contains_captions:
|
||||||
texts.append(item.text)
|
texts.append(item.text)
|
||||||
@ -490,15 +528,34 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
fig_caption = doc.add_text(
|
fig_caption = doc.add_text(
|
||||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
||||||
)
|
)
|
||||||
doc.add_picture(
|
|
||||||
parent=self.parents[self.level],
|
return fig_caption
|
||||||
caption=fig_caption,
|
|
||||||
)
|
def handle_figure(self, element, idx, doc):
|
||||||
|
"""Handles image tags (img)."""
|
||||||
|
|
||||||
|
fig_ref = self._get_imageref(element)
|
||||||
|
fig_caption = self._get_figcaption(element, doc)
|
||||||
|
|
||||||
|
_log.warn(fig_ref)
|
||||||
|
|
||||||
|
doc.add_picture(
|
||||||
|
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
|
||||||
|
)
|
||||||
|
|
||||||
def handle_image(self, element, idx, doc):
|
def handle_image(self, element, idx, doc):
|
||||||
"""Handles image tags (img)."""
|
"""Handles image tags (img)."""
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
|
||||||
|
fig_ref = self._get_imageref(element)
|
||||||
|
|
||||||
|
doc.add_picture(parent=self.parents[self.level], image=fig_ref, caption=None)
|
||||||
|
|
||||||
def handle_svg(self, element, idx, doc):
|
def handle_svg(self, element, idx, doc):
|
||||||
"""Handles svg tags."""
|
"""Handles svg tags."""
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
|
||||||
|
fig_ref = self._get_imageref(element)
|
||||||
|
fig_caption = self._get_figcaption(element, doc)
|
||||||
|
|
||||||
|
doc.add_picture(
|
||||||
|
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
|
||||||
|
)
|
||||||
|
@ -120,8 +120,8 @@ def export_documents(
|
|||||||
if export_itxt:
|
if export_itxt:
|
||||||
fname = output_dir / f"{doc_filename}.itxt"
|
fname = output_dir / f"{doc_filename}.itxt"
|
||||||
with fname.open("w") as fp:
|
with fname.open("w") as fp:
|
||||||
_log.info(f"writing Doc Tags output to {fname}")
|
_log.info(f"writing Indented Text output to {fname}")
|
||||||
fp.write(conv_res.document._export_to_indented_text())
|
fp.write(conv_res.document._export_to_indented_text())
|
||||||
else:
|
else:
|
||||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||||
failure_count += 1
|
failure_count += 1
|
||||||
|
Loading…
Reference in New Issue
Block a user