fixed the mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-22 09:44:27 +02:00
parent b04f14ec24
commit bb3db07836
2 changed files with 150 additions and 105 deletions

View File

@ -4,20 +4,20 @@ from io import BytesIO
from pathlib import Path
from typing import Set, Union
from pydantic import (
AnyUrl,
)
from docling_core.types.doc import (
Size,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
ImageRef,
)
from pydantic import AnyUrl
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend):
in_list = False
in_table = False
text_data = []
table_data = []
caption_data = []
text_data: list[str] = []
table_data: list[str] = []
caption_data: list[str] = []
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
parents: dict[int, Union[GroupItem, None]] = {}
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
parents = {}
indents = {}
for i in range(0, 10):
parents[i] = None
indents[i] = None
for line in self.lines:
#line = line.strip()
# line = line.strip()
# Title
if self.is_title(line):
item = self.parse_title(line)
level = item["level"]
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
parents[level] = doc.add_text(
text=item["text"], label=DocItemLabel.TITLE
)
# Section headers
elif self.is_section_header(line):
item = self.parse_section_header(line)
level = item["level"]
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
for k,v in parents.items():
if k>level:
parents[level] = doc.add_heading(
text=item["text"], level=item["level"], parent=parents[level - 1]
)
for k, v in parents.items():
if k > level:
parents[k] = None
# Lists
elif self.is_list_item(line):
print("line: ", line)
item = self.parse_list_item(line)
print("parsed list-item: ", item)
level = self.get_current_level(parents)
if not in_list:
in_list = True
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]
elif in_list and item["indent"]>indents[level]:
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]
elif in_list and item["indent"]<indents[level]:
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] > indents[level]:
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
while item["indent"]<indents[level]:
while item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
parents[level] = None
indents[level] = None
level -= 1
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
elif in_list and not self.is_list_item(line):
in_list = False
level = self.get_current_level(parents)
parents[level]=None
parents[level] = None
# Tables
elif line.strip()=="|===" and not in_table: # start of table
elif line.strip() == "|===" and not in_table: # start of table
in_table = True
elif self.is_table_line(line): # within a table
elif self.is_table_line(line): # within a table
in_table = True
table_data.append(self.parse_table_line(line))
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
elif in_table and (
(not self.is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None
if len(caption_data)>0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
caption_data = []
data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
doc.add_table(
data=data, parent=self.get_current_parent(parents), caption=caption
)
in_table = False
table_data = []
# Picture
elif self.is_picture(line):
caption = None
if len(caption_data)>0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
caption_data = []
item = self.parse_picture(line)
print(item)
@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend):
size = Size(width=int(item["width"]), height=int(item["height"]))
uri = None
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
uri = "file:"+item["uri"]
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
uri = "file:/"+item["uri"]
if (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("//")
):
uri = "file:" + item["uri"]
elif (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("/")
):
uri = "file:/" + item["uri"]
elif "uri" in item and not item["uri"].startswith("http"):
uri = "file://"+item["uri"]
uri = "file://" + item["uri"]
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
doc.add_picture(image=image, caption=caption)
# Caption
elif self.is_caption(line) and len(caption_data)==0:
elif self.is_caption(line) and len(caption_data) == 0:
item = self.parse_caption(line)
caption_data.append(item["text"])
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
elif (
len(line.strip()) > 0 and len(caption_data) > 0
): # allow multiline captions
item = self.parse_text(line)
caption_data.append(item["text"])
# Plain text
elif len(line.strip())==0 and len(text_data)>0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents))
elif len(line.strip()) == 0 and len(text_data) > 0:
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = []
elif len(line.strip())>0: # allow multiline texts
elif len(line.strip()) > 0: # allow multiline texts
item = self.parse_text(line)
text_data.append(item["text"])
if len(text_data) > 0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents))
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = []
if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents))
@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return doc
def get_current_level(self, parents):
    """Return the deepest nesting level that currently has an open parent.

    Args:
        parents: Mapping of nesting level (int) to the item open at that
            level, or ``None`` when the level is unused.

    Returns:
        ``k - 1`` for the first key ``k > 0`` (in the mapping's iteration
        order) whose value is ``None``; ``0`` if no such key exists.

    NOTE(review): this relies on the mapping being iterated in ascending
    level order -- confirm that the caller inserts keys that way.
    """
    for level, node in parents.items():
        # Level 0 is never reported as "free": the result is the level
        # just below the first unused slot above it.
        if node is None and level > 0:
            return level - 1
    return 0
def get_current_parent(self, parents):
    """Return the item at the deepest nesting level that is currently open.

    Args:
        parents: Mapping of nesting level (int) to the item open at that
            level, or ``None`` when the level is unused.

    Returns:
        ``parents[k - 1]`` for the first key ``k > 0`` (in the mapping's
        iteration order) whose value is ``None``; ``None`` if no such key
        exists.

    NOTE(review): this relies on the mapping being iterated in ascending
    level order -- confirm that the caller inserts keys that way.
    """
    for level, node in parents.items():
        # The parent is whatever sits just below the first unused slot.
        if node is None and level > 0:
            return parents[level - 1]
    return None
# ========= Title
def is_title(self, line):
    """Tell whether *line* starts with a single ``=`` followed by a space.

    Returns the ``re.Match`` object (truthy) on success and ``None``
    otherwise, so the result is meant to be used in a boolean context.
    """
    return re.match(r"^= ", line)
def parse_title(self, line):
    """Build the title record for a line of the form ``= Some title``.

    The first two characters (the ``= `` marker) are dropped and the
    remainder is stripped of surrounding whitespace. The title is always
    reported at level 0.
    """
    return {"type": "title", "text": line[2:].strip(), "level": 0}
# ========= Section headers
def is_section_header(self, line):
@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
def parse_section_header(self, line):
    """Parse a section-header line such as ``== Heading``.

    Args:
        line: A line consisting of one or more ``=`` characters,
            whitespace, and the heading text.

    Returns:
        A dict with ``"type": "header"``, ``"level"`` (number of ``=``
        characters minus one) and ``"text"`` (the stripped heading text).

    Raises:
        ValueError: If *line* does not have the section-header shape.
    """
    match = re.match(r"^(=+)\s+(.*)", line)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"not a section header line: {line!r}")

    marker = match.group(1)  # The run of '=' characters
    text = match.group(2)  # The heading text after the marker

    header_level = marker.count("=")  # number of '=' represents level
    return {
        "type": "header",
        "level": header_level - 1,
        "text": text.strip(),
    }
@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
if match:
indent = match.group(1)
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
text = match.group(3) # The actual text of the list item
if marker=="*" or marker=="-":
return {"type": "list_item", "marker": marker, "text": text.strip(),
"numbered": False, "indent": 0 if indent==None else len(indent)}
text = match.group(3) # The actual text of the list item
if marker == "*" or marker == "-":
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
}
else:
return {"type": "list_item", "marker": marker, "text": text.strip(),
"numbered": True, "indent": 0 if indent==None else len(indent)}
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
}
else:
# Fallback if no match
return {"type": "list_item", "marker": item_marker, "text": line,
"numbered": False, "indent": 0}
return {
"type": "list_item",
"marker": item_marker,
"text": line,
"numbered": False,
"indent": 0,
}
# ========= Tables
def is_table_line(self, line):
    """Tell whether *line* starts with ``|`` and contains a second ``|``.

    Returns the ``re.Match`` object (truthy) on success and ``None``
    otherwise. A bare ``|===`` delimiter has only one pipe and therefore
    does not match.
    """
    return re.match(r"^\|.*\|", line)
@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
if mtch:
picture_path = mtch.group(1).strip()
attributes = mtch.group(2).split(',')
attributes = mtch.group(2).split(",")
picture_info = {"type": "picture", "uri": picture_path}
# Extract optional attributes (alt text, width, height, alignment)
if attributes:
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
for attr in attributes[1:]:
key, value = attr.split('=')
key, value = attr.split("=")
picture_info[key.strip()] = value.strip()
return picture_info
return {"type": "picture", "uri": line}
# ========= Captions
@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return {"type": "caption", "text": text}
return {"type": "caption", "text": ""}
# ========= Plain text
def parse_text(self, line):
    """Wrap a plain-text line in a record, stripping surrounding whitespace."""
    return {"type": "text", "text": line.strip()}

View File

@ -27,29 +27,29 @@ def test_asciidocs_examples():
for fname in fnames:
print(f"reading {fname}")
bname = os.path.basename(fname)
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
doc_backend = _get_backend(Path(fname))
doc = doc_backend.convert()
pred_itdoc = doc._export_to_indented_text(max_text_len=16)
print("\n\n", pred_itdoc)
pred_mddoc = doc.export_to_markdown()
print("\n\n", pred_mddoc)
if os.path.exists(gname):
with open(gname, "r") as fr:
true_mddoc = fr.read()
#assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else:
with open(gname, "w") as fw:
fw.write(pred_mddoc)
#print("\n\n", doc.export_to_markdown())
# print("\n\n", doc.export_to_markdown())
input("continue")