mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
fixed the mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b04f14ec24
commit
bb3db07836
@ -4,20 +4,20 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from pydantic import (
|
||||
AnyUrl,
|
||||
)
|
||||
|
||||
from docling_core.types.doc import (
|
||||
Size,
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
NodeItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
ImageRef,
|
||||
)
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -90,12 +90,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
in_list = False
|
||||
in_table = False
|
||||
|
||||
text_data = []
|
||||
table_data = []
|
||||
caption_data = []
|
||||
text_data: list[str] = []
|
||||
table_data: list[str] = []
|
||||
caption_data: list[str] = []
|
||||
|
||||
parents = {}
|
||||
indents = {}
|
||||
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
parents: dict[int, Union[GroupItem, None]] = {}
|
||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
indents: dict[int, Union[GroupItem, None]] = {}
|
||||
|
||||
for i in range(0, 10):
|
||||
parents[i] = None
|
||||
@ -109,14 +111,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
item = self.parse_title(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
||||
parents[level] = doc.add_text(
|
||||
text=item["text"], label=DocItemLabel.TITLE
|
||||
)
|
||||
|
||||
# Section headers
|
||||
elif self.is_section_header(line):
|
||||
item = self.parse_section_header(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
|
||||
parents[level] = doc.add_heading(
|
||||
text=item["text"], level=item["level"], parent=parents[level - 1]
|
||||
)
|
||||
for k, v in parents.items():
|
||||
if k > level:
|
||||
parents[k] = None
|
||||
@ -169,16 +175,22 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
in_table = True
|
||||
table_data.append(self.parse_table_line(line))
|
||||
|
||||
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
|
||||
elif in_table and (
|
||||
(not self.is_table_line(line)) or line.strip() == "|==="
|
||||
): # end of table
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
data = self.populate_table_as_grid(table_data)
|
||||
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
|
||||
doc.add_table(
|
||||
data=data, parent=self.get_current_parent(parents), caption=caption
|
||||
)
|
||||
|
||||
in_table = False
|
||||
table_data = []
|
||||
@ -188,7 +200,9 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
@ -200,9 +214,17 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||
|
||||
uri = None
|
||||
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
|
||||
if (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("//")
|
||||
):
|
||||
uri = "file:" + item["uri"]
|
||||
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
|
||||
elif (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("/")
|
||||
):
|
||||
uri = "file:/" + item["uri"]
|
||||
elif "uri" in item and not item["uri"].startswith("http"):
|
||||
uri = "file://" + item["uri"]
|
||||
@ -215,14 +237,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
item = self.parse_caption(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
|
||||
elif (
|
||||
len(line.strip()) > 0 and len(caption_data) > 0
|
||||
): # allow multiline captions
|
||||
item = self.parse_text(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
# Plain text
|
||||
elif len(line.strip()) == 0 and len(text_data) > 0:
|
||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents))
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
elif len(line.strip()) > 0: # allow multiline texts
|
||||
@ -231,8 +258,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
text_data.append(item["text"])
|
||||
|
||||
if len(text_data) > 0:
|
||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents))
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
if in_table and len(table_data) > 0:
|
||||
@ -296,15 +326,30 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
text = match.group(3) # The actual text of the list item
|
||||
|
||||
if marker == "*" or marker == "-":
|
||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
||||
"numbered": False, "indent": 0 if indent==None else len(indent)}
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": False,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
||||
"numbered": True, "indent": 0 if indent==None else len(indent)}
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": True,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
# Fallback if no match
|
||||
return {"type": "list_item", "marker": item_marker, "text": line,
|
||||
"numbered": False, "indent": 0}
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": item_marker,
|
||||
"text": line,
|
||||
"numbered": False,
|
||||
"indent": 0,
|
||||
}
|
||||
|
||||
# ========= Tables
|
||||
def is_table_line(self, line):
|
||||
@ -357,14 +402,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
||||
if mtch:
|
||||
picture_path = mtch.group(1).strip()
|
||||
attributes = mtch.group(2).split(',')
|
||||
attributes = mtch.group(2).split(",")
|
||||
picture_info = {"type": "picture", "uri": picture_path}
|
||||
|
||||
# Extract optional attributes (alt text, width, height, alignment)
|
||||
if attributes:
|
||||
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
||||
for attr in attributes[1:]:
|
||||
key, value = attr.split('=')
|
||||
key, value = attr.split("=")
|
||||
picture_info[key.strip()] = value.strip()
|
||||
|
||||
return picture_info
|
||||
|
Loading…
Reference in New Issue
Block a user