mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
fixed the mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b04f14ec24
commit
bb3db07836
@ -4,20 +4,20 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from pydantic import (
|
|
||||||
AnyUrl,
|
|
||||||
)
|
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
Size,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
|
GroupItem,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
|
ImageRef,
|
||||||
|
NodeItem,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
ImageRef,
|
|
||||||
)
|
)
|
||||||
|
from pydantic import AnyUrl
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
in_list = False
|
in_list = False
|
||||||
in_table = False
|
in_table = False
|
||||||
|
|
||||||
text_data = []
|
text_data: list[str] = []
|
||||||
table_data = []
|
table_data: list[str] = []
|
||||||
caption_data = []
|
caption_data: list[str] = []
|
||||||
|
|
||||||
|
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||||
|
parents: dict[int, Union[GroupItem, None]] = {}
|
||||||
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||||
|
indents: dict[int, Union[GroupItem, None]] = {}
|
||||||
|
|
||||||
parents = {}
|
|
||||||
indents = {}
|
|
||||||
|
|
||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
parents[i] = None
|
parents[i] = None
|
||||||
indents[i] = None
|
indents[i] = None
|
||||||
|
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
#line = line.strip()
|
# line = line.strip()
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
if self.is_title(line):
|
if self.is_title(line):
|
||||||
item = self.parse_title(line)
|
item = self.parse_title(line)
|
||||||
level = item["level"]
|
level = item["level"]
|
||||||
|
|
||||||
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
parents[level] = doc.add_text(
|
||||||
|
text=item["text"], label=DocItemLabel.TITLE
|
||||||
|
)
|
||||||
|
|
||||||
# Section headers
|
# Section headers
|
||||||
elif self.is_section_header(line):
|
elif self.is_section_header(line):
|
||||||
item = self.parse_section_header(line)
|
item = self.parse_section_header(line)
|
||||||
level = item["level"]
|
level = item["level"]
|
||||||
|
|
||||||
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
|
parents[level] = doc.add_heading(
|
||||||
for k,v in parents.items():
|
text=item["text"], level=item["level"], parent=parents[level - 1]
|
||||||
if k>level:
|
)
|
||||||
|
for k, v in parents.items():
|
||||||
|
if k > level:
|
||||||
parents[k] = None
|
parents[k] = None
|
||||||
|
|
||||||
# Lists
|
# Lists
|
||||||
elif self.is_list_item(line):
|
elif self.is_list_item(line):
|
||||||
|
|
||||||
print("line: ", line)
|
print("line: ", line)
|
||||||
item = self.parse_list_item(line)
|
item = self.parse_list_item(line)
|
||||||
print("parsed list-item: ", item)
|
print("parsed list-item: ", item)
|
||||||
|
|
||||||
level = self.get_current_level(parents)
|
level = self.get_current_level(parents)
|
||||||
|
|
||||||
if not in_list:
|
if not in_list:
|
||||||
in_list = True
|
in_list = True
|
||||||
|
|
||||||
parents[level+1] = doc.add_group(
|
|
||||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
|
||||||
)
|
|
||||||
indents[level+1] = item["indent"]
|
|
||||||
|
|
||||||
elif in_list and item["indent"]>indents[level]:
|
|
||||||
parents[level+1] = doc.add_group(
|
|
||||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
|
||||||
)
|
|
||||||
indents[level+1] = item["indent"]
|
|
||||||
|
|
||||||
elif in_list and item["indent"]<indents[level]:
|
parents[level + 1] = doc.add_group(
|
||||||
|
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||||
|
)
|
||||||
|
indents[level + 1] = item["indent"]
|
||||||
|
|
||||||
|
elif in_list and item["indent"] > indents[level]:
|
||||||
|
parents[level + 1] = doc.add_group(
|
||||||
|
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||||
|
)
|
||||||
|
indents[level + 1] = item["indent"]
|
||||||
|
|
||||||
|
elif in_list and item["indent"] < indents[level]:
|
||||||
|
|
||||||
print(item["indent"], " => ", indents[level])
|
print(item["indent"], " => ", indents[level])
|
||||||
while item["indent"]<indents[level]:
|
while item["indent"] < indents[level]:
|
||||||
print(item["indent"], " => ", indents[level])
|
print(item["indent"], " => ", indents[level])
|
||||||
parents[level] = None
|
parents[level] = None
|
||||||
indents[level] = None
|
indents[level] = None
|
||||||
level -= 1
|
level -= 1
|
||||||
|
|
||||||
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
|
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
|
||||||
|
|
||||||
elif in_list and not self.is_list_item(line):
|
elif in_list and not self.is_list_item(line):
|
||||||
in_list = False
|
in_list = False
|
||||||
|
|
||||||
level = self.get_current_level(parents)
|
level = self.get_current_level(parents)
|
||||||
parents[level]=None
|
parents[level] = None
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
elif line.strip()=="|===" and not in_table: # start of table
|
elif line.strip() == "|===" and not in_table: # start of table
|
||||||
in_table = True
|
in_table = True
|
||||||
|
|
||||||
elif self.is_table_line(line): # within a table
|
elif self.is_table_line(line): # within a table
|
||||||
in_table = True
|
in_table = True
|
||||||
table_data.append(self.parse_table_line(line))
|
table_data.append(self.parse_table_line(line))
|
||||||
|
|
||||||
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
|
elif in_table and (
|
||||||
|
(not self.is_table_line(line)) or line.strip() == "|==="
|
||||||
|
): # end of table
|
||||||
|
|
||||||
caption = None
|
caption = None
|
||||||
if len(caption_data)>0:
|
if len(caption_data) > 0:
|
||||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
caption = doc.add_text(
|
||||||
|
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||||
|
)
|
||||||
|
|
||||||
|
caption_data = []
|
||||||
|
|
||||||
caption_data = []
|
|
||||||
|
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
|
doc.add_table(
|
||||||
|
data=data, parent=self.get_current_parent(parents), caption=caption
|
||||||
|
)
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
# Picture
|
# Picture
|
||||||
elif self.is_picture(line):
|
elif self.is_picture(line):
|
||||||
|
|
||||||
caption = None
|
caption = None
|
||||||
if len(caption_data)>0:
|
if len(caption_data) > 0:
|
||||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
caption = doc.add_text(
|
||||||
|
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||||
|
)
|
||||||
|
|
||||||
|
caption_data = []
|
||||||
|
|
||||||
caption_data = []
|
|
||||||
|
|
||||||
item = self.parse_picture(line)
|
item = self.parse_picture(line)
|
||||||
print(item)
|
print(item)
|
||||||
|
|
||||||
@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||||
|
|
||||||
uri = None
|
uri = None
|
||||||
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
|
if (
|
||||||
uri = "file:"+item["uri"]
|
"uri" in item
|
||||||
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
|
and not item["uri"].startswith("http")
|
||||||
uri = "file:/"+item["uri"]
|
and item["uri"].startswith("//")
|
||||||
|
):
|
||||||
|
uri = "file:" + item["uri"]
|
||||||
|
elif (
|
||||||
|
"uri" in item
|
||||||
|
and not item["uri"].startswith("http")
|
||||||
|
and item["uri"].startswith("/")
|
||||||
|
):
|
||||||
|
uri = "file:/" + item["uri"]
|
||||||
elif "uri" in item and not item["uri"].startswith("http"):
|
elif "uri" in item and not item["uri"].startswith("http"):
|
||||||
uri = "file://"+item["uri"]
|
uri = "file://" + item["uri"]
|
||||||
|
|
||||||
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
|
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
|
||||||
doc.add_picture(image=image, caption=caption)
|
doc.add_picture(image=image, caption=caption)
|
||||||
|
|
||||||
# Caption
|
# Caption
|
||||||
elif self.is_caption(line) and len(caption_data)==0:
|
elif self.is_caption(line) and len(caption_data) == 0:
|
||||||
item = self.parse_caption(line)
|
item = self.parse_caption(line)
|
||||||
caption_data.append(item["text"])
|
caption_data.append(item["text"])
|
||||||
|
|
||||||
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
|
elif (
|
||||||
|
len(line.strip()) > 0 and len(caption_data) > 0
|
||||||
|
): # allow multiline captions
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
caption_data.append(item["text"])
|
caption_data.append(item["text"])
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
elif len(line.strip())==0 and len(text_data)>0:
|
elif len(line.strip()) == 0 and len(text_data) > 0:
|
||||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
doc.add_text(
|
||||||
parent=self.get_current_parent(parents))
|
text=" ".join(text_data),
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.get_current_parent(parents),
|
||||||
|
)
|
||||||
text_data = []
|
text_data = []
|
||||||
|
|
||||||
elif len(line.strip())>0: # allow multiline texts
|
elif len(line.strip()) > 0: # allow multiline texts
|
||||||
|
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
text_data.append(item["text"])
|
text_data.append(item["text"])
|
||||||
|
|
||||||
if len(text_data) > 0:
|
if len(text_data) > 0:
|
||||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
doc.add_text(
|
||||||
parent=self.get_current_parent(parents))
|
text=" ".join(text_data),
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.get_current_parent(parents),
|
||||||
|
)
|
||||||
text_data = []
|
text_data = []
|
||||||
|
|
||||||
if in_table and len(table_data) > 0:
|
if in_table and len(table_data) > 0:
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data, parent=self.get_current_parent(parents))
|
doc.add_table(data=data, parent=self.get_current_parent(parents))
|
||||||
@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
def get_current_level(self, parents):
|
def get_current_level(self, parents):
|
||||||
for k,v in parents.items():
|
for k, v in parents.items():
|
||||||
if v==None and k>0:
|
if v == None and k > 0:
|
||||||
return k-1
|
return k - 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_current_parent(self, parents):
|
def get_current_parent(self, parents):
|
||||||
for k,v in parents.items():
|
for k, v in parents.items():
|
||||||
if v==None and k>0:
|
if v == None and k > 0:
|
||||||
return parents[k-1]
|
return parents[k - 1]
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# ========= Title
|
# ========= Title
|
||||||
def is_title(self, line):
|
def is_title(self, line):
|
||||||
return re.match(r"^= ", line)
|
return re.match(r"^= ", line)
|
||||||
|
|
||||||
def parse_title(self, line):
|
def parse_title(self, line):
|
||||||
return {"type": "title", "text": line[2:].strip(), "level":0}
|
return {"type": "title", "text": line[2:].strip(), "level": 0}
|
||||||
|
|
||||||
# ========= Section headers
|
# ========= Section headers
|
||||||
def is_section_header(self, line):
|
def is_section_header(self, line):
|
||||||
@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def parse_section_header(self, line):
|
def parse_section_header(self, line):
|
||||||
match = re.match(r"^(=+)\s+(.*)", line)
|
match = re.match(r"^(=+)\s+(.*)", line)
|
||||||
|
|
||||||
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
|
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
|
||||||
text = match.group(2) # The actual text of the list item
|
text = match.group(2) # The actual text of the list item
|
||||||
|
|
||||||
header_level = marker.count("=") # number of '=' represents level
|
header_level = marker.count("=") # number of '=' represents level
|
||||||
return {
|
return {
|
||||||
"type": "header",
|
"type": "header",
|
||||||
"level": header_level-1,
|
"level": header_level - 1,
|
||||||
"text": text.strip(),
|
"text": text.strip(),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
if match:
|
if match:
|
||||||
indent = match.group(1)
|
indent = match.group(1)
|
||||||
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
|
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
|
||||||
text = match.group(3) # The actual text of the list item
|
text = match.group(3) # The actual text of the list item
|
||||||
|
|
||||||
if marker=="*" or marker=="-":
|
if marker == "*" or marker == "-":
|
||||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
return {
|
||||||
"numbered": False, "indent": 0 if indent==None else len(indent)}
|
"type": "list_item",
|
||||||
|
"marker": marker,
|
||||||
|
"text": text.strip(),
|
||||||
|
"numbered": False,
|
||||||
|
"indent": 0 if indent == None else len(indent),
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
return {
|
||||||
"numbered": True, "indent": 0 if indent==None else len(indent)}
|
"type": "list_item",
|
||||||
|
"marker": marker,
|
||||||
|
"text": text.strip(),
|
||||||
|
"numbered": True,
|
||||||
|
"indent": 0 if indent == None else len(indent),
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
# Fallback if no match
|
# Fallback if no match
|
||||||
return {"type": "list_item", "marker": item_marker, "text": line,
|
return {
|
||||||
"numbered": False, "indent": 0}
|
"type": "list_item",
|
||||||
|
"marker": item_marker,
|
||||||
|
"text": line,
|
||||||
|
"numbered": False,
|
||||||
|
"indent": 0,
|
||||||
|
}
|
||||||
|
|
||||||
# ========= Tables
|
# ========= Tables
|
||||||
def is_table_line(self, line):
|
def is_table_line(self, line):
|
||||||
return re.match(r"^\|.*\|", line)
|
return re.match(r"^\|.*\|", line)
|
||||||
@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
||||||
if mtch:
|
if mtch:
|
||||||
picture_path = mtch.group(1).strip()
|
picture_path = mtch.group(1).strip()
|
||||||
attributes = mtch.group(2).split(',')
|
attributes = mtch.group(2).split(",")
|
||||||
picture_info = {"type": "picture", "uri": picture_path}
|
picture_info = {"type": "picture", "uri": picture_path}
|
||||||
|
|
||||||
# Extract optional attributes (alt text, width, height, alignment)
|
# Extract optional attributes (alt text, width, height, alignment)
|
||||||
if attributes:
|
if attributes:
|
||||||
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
||||||
for attr in attributes[1:]:
|
for attr in attributes[1:]:
|
||||||
key, value = attr.split('=')
|
key, value = attr.split("=")
|
||||||
picture_info[key.strip()] = value.strip()
|
picture_info[key.strip()] = value.strip()
|
||||||
|
|
||||||
return picture_info
|
return picture_info
|
||||||
|
|
||||||
return {"type": "picture", "uri": line}
|
return {"type": "picture", "uri": line}
|
||||||
|
|
||||||
# ========= Captions
|
# ========= Captions
|
||||||
@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
return {"type": "caption", "text": text}
|
return {"type": "caption", "text": text}
|
||||||
|
|
||||||
return {"type": "caption", "text": ""}
|
return {"type": "caption", "text": ""}
|
||||||
|
|
||||||
# ========= Plain text
|
# ========= Plain text
|
||||||
def parse_text(self, line):
|
def parse_text(self, line):
|
||||||
return {"type": "text", "text": line.strip()}
|
return {"type": "text", "text": line.strip()}
|
||||||
|
@ -27,29 +27,29 @@ def test_asciidocs_examples():
|
|||||||
|
|
||||||
for fname in fnames:
|
for fname in fnames:
|
||||||
print(f"reading {fname}")
|
print(f"reading {fname}")
|
||||||
|
|
||||||
bname = os.path.basename(fname)
|
bname = os.path.basename(fname)
|
||||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
||||||
|
|
||||||
doc_backend = _get_backend(Path(fname))
|
doc_backend = _get_backend(Path(fname))
|
||||||
doc = doc_backend.convert()
|
doc = doc_backend.convert()
|
||||||
|
|
||||||
pred_itdoc = doc._export_to_indented_text(max_text_len=16)
|
pred_itdoc = doc._export_to_indented_text(max_text_len=16)
|
||||||
print("\n\n", pred_itdoc)
|
print("\n\n", pred_itdoc)
|
||||||
|
|
||||||
pred_mddoc = doc.export_to_markdown()
|
pred_mddoc = doc.export_to_markdown()
|
||||||
print("\n\n", pred_mddoc)
|
print("\n\n", pred_mddoc)
|
||||||
|
|
||||||
if os.path.exists(gname):
|
if os.path.exists(gname):
|
||||||
with open(gname, "r") as fr:
|
with open(gname, "r") as fr:
|
||||||
true_mddoc = fr.read()
|
true_mddoc = fr.read()
|
||||||
|
|
||||||
#assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||||
else:
|
else:
|
||||||
with open(gname, "w") as fw:
|
with open(gname, "w") as fw:
|
||||||
fw.write(pred_mddoc)
|
fw.write(pred_mddoc)
|
||||||
|
|
||||||
#print("\n\n", doc.export_to_markdown())
|
# print("\n\n", doc.export_to_markdown())
|
||||||
|
|
||||||
input("continue")
|
input("continue")
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user