Fixed the mypy type-checking errors in the AsciiDoc backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-22 09:44:27 +02:00
parent b04f14ec24
commit bb3db07836
2 changed files with 150 additions and 105 deletions

View File

@ -4,20 +4,20 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from pydantic import (
AnyUrl,
)
from docling_core.types.doc import ( from docling_core.types.doc import (
Size, DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
GroupItem,
GroupLabel, GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell, TableCell,
TableData, TableData,
ImageRef,
) )
from pydantic import AnyUrl
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend):
in_list = False in_list = False
in_table = False in_table = False
text_data = [] text_data: list[str] = []
table_data = [] table_data: list[str] = []
caption_data = [] caption_data: list[str] = []
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
parents: dict[int, Union[GroupItem, None]] = {}
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
parents = {}
indents = {}
for i in range(0, 10): for i in range(0, 10):
parents[i] = None parents[i] = None
indents[i] = None indents[i] = None
for line in self.lines: for line in self.lines:
#line = line.strip() # line = line.strip()
# Title # Title
if self.is_title(line): if self.is_title(line):
item = self.parse_title(line) item = self.parse_title(line)
level = item["level"] level = item["level"]
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE) parents[level] = doc.add_text(
text=item["text"], label=DocItemLabel.TITLE
)
# Section headers # Section headers
elif self.is_section_header(line): elif self.is_section_header(line):
item = self.parse_section_header(line) item = self.parse_section_header(line)
level = item["level"] level = item["level"]
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1]) parents[level] = doc.add_heading(
for k,v in parents.items(): text=item["text"], level=item["level"], parent=parents[level - 1]
if k>level: )
for k, v in parents.items():
if k > level:
parents[k] = None parents[k] = None
# Lists # Lists
elif self.is_list_item(line): elif self.is_list_item(line):
print("line: ", line) print("line: ", line)
item = self.parse_list_item(line) item = self.parse_list_item(line)
print("parsed list-item: ", item) print("parsed list-item: ", item)
level = self.get_current_level(parents) level = self.get_current_level(parents)
if not in_list: if not in_list:
in_list = True in_list = True
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]
elif in_list and item["indent"]>indents[level]:
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]
elif in_list and item["indent"]<indents[level]: parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] > indents[level]:
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level]) print(item["indent"], " => ", indents[level])
while item["indent"]<indents[level]: while item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level]) print(item["indent"], " => ", indents[level])
parents[level] = None parents[level] = None
indents[level] = None indents[level] = None
level -= 1 level -= 1
doc.add_list_item(item["text"], parent=self.get_current_parent(parents)) doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
elif in_list and not self.is_list_item(line): elif in_list and not self.is_list_item(line):
in_list = False in_list = False
level = self.get_current_level(parents) level = self.get_current_level(parents)
parents[level]=None parents[level] = None
# Tables # Tables
elif line.strip()=="|===" and not in_table: # start of table elif line.strip() == "|===" and not in_table: # start of table
in_table = True in_table = True
elif self.is_table_line(line): # within a table elif self.is_table_line(line): # within a table
in_table = True in_table = True
table_data.append(self.parse_table_line(line)) table_data.append(self.parse_table_line(line))
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table elif in_table and (
(not self.is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None caption = None
if len(caption_data)>0: if len(caption_data) > 0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION) caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
caption_data = []
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption) doc.add_table(
data=data, parent=self.get_current_parent(parents), caption=caption
)
in_table = False in_table = False
table_data = [] table_data = []
# Picture # Picture
elif self.is_picture(line): elif self.is_picture(line):
caption = None caption = None
if len(caption_data)>0: if len(caption_data) > 0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION) caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
caption_data = []
item = self.parse_picture(line) item = self.parse_picture(line)
print(item) print(item)
@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend):
size = Size(width=int(item["width"]), height=int(item["height"])) size = Size(width=int(item["width"]), height=int(item["height"]))
uri = None uri = None
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"): if (
uri = "file:"+item["uri"] "uri" in item
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"): and not item["uri"].startswith("http")
uri = "file:/"+item["uri"] and item["uri"].startswith("//")
):
uri = "file:" + item["uri"]
elif (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("/")
):
uri = "file:/" + item["uri"]
elif "uri" in item and not item["uri"].startswith("http"): elif "uri" in item and not item["uri"].startswith("http"):
uri = "file://"+item["uri"] uri = "file://" + item["uri"]
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri) image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
doc.add_picture(image=image, caption=caption) doc.add_picture(image=image, caption=caption)
# Caption # Caption
elif self.is_caption(line) and len(caption_data)==0: elif self.is_caption(line) and len(caption_data) == 0:
item = self.parse_caption(line) item = self.parse_caption(line)
caption_data.append(item["text"]) caption_data.append(item["text"])
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions elif (
len(line.strip()) > 0 and len(caption_data) > 0
): # allow multiline captions
item = self.parse_text(line) item = self.parse_text(line)
caption_data.append(item["text"]) caption_data.append(item["text"])
# Plain text # Plain text
elif len(line.strip())==0 and len(text_data)>0: elif len(line.strip()) == 0 and len(text_data) > 0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH, doc.add_text(
parent=self.get_current_parent(parents)) text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = [] text_data = []
elif len(line.strip())>0: # allow multiline texts elif len(line.strip()) > 0: # allow multiline texts
item = self.parse_text(line) item = self.parse_text(line)
text_data.append(item["text"]) text_data.append(item["text"])
if len(text_data) > 0: if len(text_data) > 0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH, doc.add_text(
parent=self.get_current_parent(parents)) text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = [] text_data = []
if in_table and len(table_data) > 0: if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents)) doc.add_table(data=data, parent=self.get_current_parent(parents))
@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return doc return doc
def get_current_level(self, parents): def get_current_level(self, parents):
for k,v in parents.items(): for k, v in parents.items():
if v==None and k>0: if v == None and k > 0:
return k-1 return k - 1
return 0 return 0
def get_current_parent(self, parents): def get_current_parent(self, parents):
for k,v in parents.items(): for k, v in parents.items():
if v==None and k>0: if v == None and k > 0:
return parents[k-1] return parents[k - 1]
return None return None
# ========= Title # ========= Title
def is_title(self, line): def is_title(self, line):
return re.match(r"^= ", line) return re.match(r"^= ", line)
def parse_title(self, line): def parse_title(self, line):
return {"type": "title", "text": line[2:].strip(), "level":0} return {"type": "title", "text": line[2:].strip(), "level": 0}
# ========= Section headers # ========= Section headers
def is_section_header(self, line): def is_section_header(self, line):
@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
def parse_section_header(self, line): def parse_section_header(self, line):
match = re.match(r"^(=+)\s+(.*)", line) match = re.match(r"^(=+)\s+(.*)", line)
marker = match.group(1) # The list marker (e.g., "*", "-", "1.") marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
text = match.group(2) # The actual text of the list item text = match.group(2) # The actual text of the list item
header_level = marker.count("=") # number of '=' represents level header_level = marker.count("=") # number of '=' represents level
return { return {
"type": "header", "type": "header",
"level": header_level-1, "level": header_level - 1,
"text": text.strip(), "text": text.strip(),
} }
@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
if match: if match:
indent = match.group(1) indent = match.group(1)
marker = match.group(2) # The list marker (e.g., "*", "-", "1.") marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
text = match.group(3) # The actual text of the list item text = match.group(3) # The actual text of the list item
if marker=="*" or marker=="-": if marker == "*" or marker == "-":
return {"type": "list_item", "marker": marker, "text": text.strip(), return {
"numbered": False, "indent": 0 if indent==None else len(indent)} "type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
}
else: else:
return {"type": "list_item", "marker": marker, "text": text.strip(), return {
"numbered": True, "indent": 0 if indent==None else len(indent)} "type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
}
else: else:
# Fallback if no match # Fallback if no match
return {"type": "list_item", "marker": item_marker, "text": line, return {
"numbered": False, "indent": 0} "type": "list_item",
"marker": item_marker,
"text": line,
"numbered": False,
"indent": 0,
}
# ========= Tables # ========= Tables
def is_table_line(self, line): def is_table_line(self, line):
return re.match(r"^\|.*\|", line) return re.match(r"^\|.*\|", line)
@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
mtch = re.match(r"^image::(.+)\[(.*)\]$", line) mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
if mtch: if mtch:
picture_path = mtch.group(1).strip() picture_path = mtch.group(1).strip()
attributes = mtch.group(2).split(',') attributes = mtch.group(2).split(",")
picture_info = {"type": "picture", "uri": picture_path} picture_info = {"type": "picture", "uri": picture_path}
# Extract optional attributes (alt text, width, height, alignment) # Extract optional attributes (alt text, width, height, alignment)
if attributes: if attributes:
picture_info["alt"] = attributes[0].strip() if attributes[0] else "" picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
for attr in attributes[1:]: for attr in attributes[1:]:
key, value = attr.split('=') key, value = attr.split("=")
picture_info[key.strip()] = value.strip() picture_info[key.strip()] = value.strip()
return picture_info return picture_info
return {"type": "picture", "uri": line} return {"type": "picture", "uri": line}
# ========= Captions # ========= Captions
@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return {"type": "caption", "text": text} return {"type": "caption", "text": text}
return {"type": "caption", "text": ""} return {"type": "caption", "text": ""}
# ========= Plain text # ========= Plain text
def parse_text(self, line): def parse_text(self, line):
return {"type": "text", "text": line.strip()} return {"type": "text", "text": line.strip()}

View File

@ -27,29 +27,29 @@ def test_asciidocs_examples():
for fname in fnames: for fname in fnames:
print(f"reading {fname}") print(f"reading {fname}")
bname = os.path.basename(fname) bname = os.path.basename(fname)
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md") gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
doc_backend = _get_backend(Path(fname)) doc_backend = _get_backend(Path(fname))
doc = doc_backend.convert() doc = doc_backend.convert()
pred_itdoc = doc._export_to_indented_text(max_text_len=16) pred_itdoc = doc._export_to_indented_text(max_text_len=16)
print("\n\n", pred_itdoc) print("\n\n", pred_itdoc)
pred_mddoc = doc.export_to_markdown() pred_mddoc = doc.export_to_markdown()
print("\n\n", pred_mddoc) print("\n\n", pred_mddoc)
if os.path.exists(gname): if os.path.exists(gname):
with open(gname, "r") as fr: with open(gname, "r") as fr:
true_mddoc = fr.read() true_mddoc = fr.read()
#assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else: else:
with open(gname, "w") as fw: with open(gname, "w") as fw:
fw.write(pred_mddoc) fw.write(pred_mddoc)
#print("\n\n", doc.export_to_markdown()) # print("\n\n", doc.export_to_markdown())
input("continue") input("continue")