mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
fixed the mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b04f14ec24
commit
bb3db07836
@ -4,20 +4,20 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from pydantic import (
|
||||
AnyUrl,
|
||||
)
|
||||
|
||||
from docling_core.types.doc import (
|
||||
Size,
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
NodeItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
ImageRef,
|
||||
)
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
in_list = False
|
||||
in_table = False
|
||||
|
||||
text_data = []
|
||||
table_data = []
|
||||
caption_data = []
|
||||
text_data: list[str] = []
|
||||
table_data: list[str] = []
|
||||
caption_data: list[str] = []
|
||||
|
||||
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
parents: dict[int, Union[GroupItem, None]] = {}
|
||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
indents: dict[int, Union[GroupItem, None]] = {}
|
||||
|
||||
parents = {}
|
||||
indents = {}
|
||||
|
||||
for i in range(0, 10):
|
||||
parents[i] = None
|
||||
indents[i] = None
|
||||
|
||||
|
||||
for line in self.lines:
|
||||
#line = line.strip()
|
||||
# line = line.strip()
|
||||
|
||||
# Title
|
||||
if self.is_title(line):
|
||||
item = self.parse_title(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
||||
|
||||
|
||||
parents[level] = doc.add_text(
|
||||
text=item["text"], label=DocItemLabel.TITLE
|
||||
)
|
||||
|
||||
# Section headers
|
||||
elif self.is_section_header(line):
|
||||
item = self.parse_section_header(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
|
||||
for k,v in parents.items():
|
||||
if k>level:
|
||||
|
||||
parents[level] = doc.add_heading(
|
||||
text=item["text"], level=item["level"], parent=parents[level - 1]
|
||||
)
|
||||
for k, v in parents.items():
|
||||
if k > level:
|
||||
parents[k] = None
|
||||
|
||||
|
||||
# Lists
|
||||
elif self.is_list_item(line):
|
||||
|
||||
print("line: ", line)
|
||||
item = self.parse_list_item(line)
|
||||
print("parsed list-item: ", item)
|
||||
|
||||
|
||||
level = self.get_current_level(parents)
|
||||
|
||||
|
||||
if not in_list:
|
||||
in_list = True
|
||||
|
||||
parents[level+1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level+1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"]>indents[level]:
|
||||
parents[level+1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level+1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"]<indents[level]:
|
||||
parents[level + 1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] > indents[level]:
|
||||
parents[level + 1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] < indents[level]:
|
||||
|
||||
print(item["indent"], " => ", indents[level])
|
||||
while item["indent"]<indents[level]:
|
||||
while item["indent"] < indents[level]:
|
||||
print(item["indent"], " => ", indents[level])
|
||||
parents[level] = None
|
||||
indents[level] = None
|
||||
level -= 1
|
||||
|
||||
|
||||
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
|
||||
|
||||
elif in_list and not self.is_list_item(line):
|
||||
in_list = False
|
||||
|
||||
level = self.get_current_level(parents)
|
||||
parents[level]=None
|
||||
parents[level] = None
|
||||
|
||||
# Tables
|
||||
elif line.strip()=="|===" and not in_table: # start of table
|
||||
elif line.strip() == "|===" and not in_table: # start of table
|
||||
in_table = True
|
||||
|
||||
elif self.is_table_line(line): # within a table
|
||||
|
||||
elif self.is_table_line(line): # within a table
|
||||
in_table = True
|
||||
table_data.append(self.parse_table_line(line))
|
||||
|
||||
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
|
||||
elif in_table and (
|
||||
(not self.is_table_line(line)) or line.strip() == "|==="
|
||||
): # end of table
|
||||
|
||||
caption = None
|
||||
if len(caption_data)>0:
|
||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
caption_data = []
|
||||
|
||||
data = self.populate_table_as_grid(table_data)
|
||||
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
|
||||
doc.add_table(
|
||||
data=data, parent=self.get_current_parent(parents), caption=caption
|
||||
)
|
||||
|
||||
in_table = False
|
||||
table_data = []
|
||||
|
||||
|
||||
# Picture
|
||||
elif self.is_picture(line):
|
||||
|
||||
caption = None
|
||||
if len(caption_data)>0:
|
||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
caption_data = []
|
||||
|
||||
item = self.parse_picture(line)
|
||||
print(item)
|
||||
|
||||
@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||
|
||||
uri = None
|
||||
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
|
||||
uri = "file:"+item["uri"]
|
||||
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
|
||||
uri = "file:/"+item["uri"]
|
||||
if (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("//")
|
||||
):
|
||||
uri = "file:" + item["uri"]
|
||||
elif (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("/")
|
||||
):
|
||||
uri = "file:/" + item["uri"]
|
||||
elif "uri" in item and not item["uri"].startswith("http"):
|
||||
uri = "file://"+item["uri"]
|
||||
uri = "file://" + item["uri"]
|
||||
|
||||
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
|
||||
doc.add_picture(image=image, caption=caption)
|
||||
|
||||
|
||||
# Caption
|
||||
elif self.is_caption(line) and len(caption_data)==0:
|
||||
elif self.is_caption(line) and len(caption_data) == 0:
|
||||
item = self.parse_caption(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
|
||||
elif (
|
||||
len(line.strip()) > 0 and len(caption_data) > 0
|
||||
): # allow multiline captions
|
||||
item = self.parse_text(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
|
||||
# Plain text
|
||||
elif len(line.strip())==0 and len(text_data)>0:
|
||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents))
|
||||
elif len(line.strip()) == 0 and len(text_data) > 0:
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
elif len(line.strip())>0: # allow multiline texts
|
||||
|
||||
|
||||
elif len(line.strip()) > 0: # allow multiline texts
|
||||
|
||||
item = self.parse_text(line)
|
||||
text_data.append(item["text"])
|
||||
|
||||
if len(text_data) > 0:
|
||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents))
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
|
||||
if in_table and len(table_data) > 0:
|
||||
data = self.populate_table_as_grid(table_data)
|
||||
doc.add_table(data=data, parent=self.get_current_parent(parents))
|
||||
@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
return doc
|
||||
|
||||
def get_current_level(self, parents):
|
||||
for k,v in parents.items():
|
||||
if v==None and k>0:
|
||||
return k-1
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
return k - 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def get_current_parent(self, parents):
|
||||
for k,v in parents.items():
|
||||
if v==None and k>0:
|
||||
return parents[k-1]
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
return parents[k - 1]
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ========= Title
|
||||
def is_title(self, line):
|
||||
return re.match(r"^= ", line)
|
||||
|
||||
def parse_title(self, line):
|
||||
return {"type": "title", "text": line[2:].strip(), "level":0}
|
||||
return {"type": "title", "text": line[2:].strip(), "level": 0}
|
||||
|
||||
# ========= Section headers
|
||||
def is_section_header(self, line):
|
||||
@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def parse_section_header(self, line):
|
||||
match = re.match(r"^(=+)\s+(.*)", line)
|
||||
|
||||
|
||||
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
|
||||
text = match.group(2) # The actual text of the list item
|
||||
|
||||
text = match.group(2) # The actual text of the list item
|
||||
|
||||
header_level = marker.count("=") # number of '=' represents level
|
||||
return {
|
||||
"type": "header",
|
||||
"level": header_level-1,
|
||||
"level": header_level - 1,
|
||||
"text": text.strip(),
|
||||
}
|
||||
|
||||
@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
if match:
|
||||
indent = match.group(1)
|
||||
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
|
||||
text = match.group(3) # The actual text of the list item
|
||||
|
||||
if marker=="*" or marker=="-":
|
||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
||||
"numbered": False, "indent": 0 if indent==None else len(indent)}
|
||||
text = match.group(3) # The actual text of the list item
|
||||
|
||||
if marker == "*" or marker == "-":
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": False,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
||||
"numbered": True, "indent": 0 if indent==None else len(indent)}
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": True,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
# Fallback if no match
|
||||
return {"type": "list_item", "marker": item_marker, "text": line,
|
||||
"numbered": False, "indent": 0}
|
||||
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": item_marker,
|
||||
"text": line,
|
||||
"numbered": False,
|
||||
"indent": 0,
|
||||
}
|
||||
|
||||
# ========= Tables
|
||||
def is_table_line(self, line):
|
||||
return re.match(r"^\|.*\|", line)
|
||||
@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
||||
if mtch:
|
||||
picture_path = mtch.group(1).strip()
|
||||
attributes = mtch.group(2).split(',')
|
||||
attributes = mtch.group(2).split(",")
|
||||
picture_info = {"type": "picture", "uri": picture_path}
|
||||
|
||||
# Extract optional attributes (alt text, width, height, alignment)
|
||||
if attributes:
|
||||
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
||||
for attr in attributes[1:]:
|
||||
key, value = attr.split('=')
|
||||
key, value = attr.split("=")
|
||||
picture_info[key.strip()] = value.strip()
|
||||
|
||||
|
||||
return picture_info
|
||||
|
||||
|
||||
return {"type": "picture", "uri": line}
|
||||
|
||||
# ========= Captions
|
||||
@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
return {"type": "caption", "text": text}
|
||||
|
||||
return {"type": "caption", "text": ""}
|
||||
|
||||
|
||||
# ========= Plain text
|
||||
def parse_text(self, line):
|
||||
return {"type": "text", "text": line.strip()}
|
||||
|
@ -27,29 +27,29 @@ def test_asciidocs_examples():
|
||||
|
||||
for fname in fnames:
|
||||
print(f"reading {fname}")
|
||||
|
||||
|
||||
bname = os.path.basename(fname)
|
||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
||||
|
||||
|
||||
doc_backend = _get_backend(Path(fname))
|
||||
doc = doc_backend.convert()
|
||||
|
||||
pred_itdoc = doc._export_to_indented_text(max_text_len=16)
|
||||
print("\n\n", pred_itdoc)
|
||||
|
||||
|
||||
pred_mddoc = doc.export_to_markdown()
|
||||
print("\n\n", pred_mddoc)
|
||||
|
||||
|
||||
if os.path.exists(gname):
|
||||
with open(gname, "r") as fr:
|
||||
true_mddoc = fr.read()
|
||||
|
||||
#assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||
else:
|
||||
with open(gname, "w") as fw:
|
||||
fw.write(pred_mddoc)
|
||||
|
||||
#print("\n\n", doc.export_to_markdown())
|
||||
# print("\n\n", doc.export_to_markdown())
|
||||
|
||||
input("continue")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user