mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
fixed the mypy
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
b04f14ec24
commit
bb3db07836
@ -4,20 +4,20 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from pydantic import (
|
|
||||||
AnyUrl,
|
|
||||||
)
|
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
Size,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
|
GroupItem,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
|
ImageRef,
|
||||||
|
NodeItem,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
ImageRef,
|
|
||||||
)
|
)
|
||||||
|
from pydantic import AnyUrl
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -90,12 +90,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
in_list = False
|
in_list = False
|
||||||
in_table = False
|
in_table = False
|
||||||
|
|
||||||
text_data = []
|
text_data: list[str] = []
|
||||||
table_data = []
|
table_data: list[str] = []
|
||||||
caption_data = []
|
caption_data: list[str] = []
|
||||||
|
|
||||||
parents = {}
|
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||||
indents = {}
|
parents: dict[int, Union[GroupItem, None]] = {}
|
||||||
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||||
|
indents: dict[int, Union[GroupItem, None]] = {}
|
||||||
|
|
||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
parents[i] = None
|
parents[i] = None
|
||||||
@ -109,14 +111,18 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
item = self.parse_title(line)
|
item = self.parse_title(line)
|
||||||
level = item["level"]
|
level = item["level"]
|
||||||
|
|
||||||
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
parents[level] = doc.add_text(
|
||||||
|
text=item["text"], label=DocItemLabel.TITLE
|
||||||
|
)
|
||||||
|
|
||||||
# Section headers
|
# Section headers
|
||||||
elif self.is_section_header(line):
|
elif self.is_section_header(line):
|
||||||
item = self.parse_section_header(line)
|
item = self.parse_section_header(line)
|
||||||
level = item["level"]
|
level = item["level"]
|
||||||
|
|
||||||
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
|
parents[level] = doc.add_heading(
|
||||||
|
text=item["text"], level=item["level"], parent=parents[level - 1]
|
||||||
|
)
|
||||||
for k, v in parents.items():
|
for k, v in parents.items():
|
||||||
if k > level:
|
if k > level:
|
||||||
parents[k] = None
|
parents[k] = None
|
||||||
@ -169,16 +175,22 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
in_table = True
|
in_table = True
|
||||||
table_data.append(self.parse_table_line(line))
|
table_data.append(self.parse_table_line(line))
|
||||||
|
|
||||||
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
|
elif in_table and (
|
||||||
|
(not self.is_table_line(line)) or line.strip() == "|==="
|
||||||
|
): # end of table
|
||||||
|
|
||||||
caption = None
|
caption = None
|
||||||
if len(caption_data) > 0:
|
if len(caption_data) > 0:
|
||||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
caption = doc.add_text(
|
||||||
|
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||||
|
)
|
||||||
|
|
||||||
caption_data = []
|
caption_data = []
|
||||||
|
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
|
doc.add_table(
|
||||||
|
data=data, parent=self.get_current_parent(parents), caption=caption
|
||||||
|
)
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
@ -188,7 +200,9 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
caption = None
|
caption = None
|
||||||
if len(caption_data) > 0:
|
if len(caption_data) > 0:
|
||||||
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
|
caption = doc.add_text(
|
||||||
|
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||||
|
)
|
||||||
|
|
||||||
caption_data = []
|
caption_data = []
|
||||||
|
|
||||||
@ -200,9 +214,17 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||||
|
|
||||||
uri = None
|
uri = None
|
||||||
if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"):
|
if (
|
||||||
|
"uri" in item
|
||||||
|
and not item["uri"].startswith("http")
|
||||||
|
and item["uri"].startswith("//")
|
||||||
|
):
|
||||||
uri = "file:" + item["uri"]
|
uri = "file:" + item["uri"]
|
||||||
elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"):
|
elif (
|
||||||
|
"uri" in item
|
||||||
|
and not item["uri"].startswith("http")
|
||||||
|
and item["uri"].startswith("/")
|
||||||
|
):
|
||||||
uri = "file:/" + item["uri"]
|
uri = "file:/" + item["uri"]
|
||||||
elif "uri" in item and not item["uri"].startswith("http"):
|
elif "uri" in item and not item["uri"].startswith("http"):
|
||||||
uri = "file://" + item["uri"]
|
uri = "file://" + item["uri"]
|
||||||
@ -215,14 +237,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
item = self.parse_caption(line)
|
item = self.parse_caption(line)
|
||||||
caption_data.append(item["text"])
|
caption_data.append(item["text"])
|
||||||
|
|
||||||
elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
|
elif (
|
||||||
|
len(line.strip()) > 0 and len(caption_data) > 0
|
||||||
|
): # allow multiline captions
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
caption_data.append(item["text"])
|
caption_data.append(item["text"])
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
elif len(line.strip()) == 0 and len(text_data) > 0:
|
elif len(line.strip()) == 0 and len(text_data) > 0:
|
||||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
doc.add_text(
|
||||||
parent=self.get_current_parent(parents))
|
text=" ".join(text_data),
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.get_current_parent(parents),
|
||||||
|
)
|
||||||
text_data = []
|
text_data = []
|
||||||
|
|
||||||
elif len(line.strip()) > 0: # allow multiline texts
|
elif len(line.strip()) > 0: # allow multiline texts
|
||||||
@ -231,8 +258,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
text_data.append(item["text"])
|
text_data.append(item["text"])
|
||||||
|
|
||||||
if len(text_data) > 0:
|
if len(text_data) > 0:
|
||||||
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
|
doc.add_text(
|
||||||
parent=self.get_current_parent(parents))
|
text=" ".join(text_data),
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.get_current_parent(parents),
|
||||||
|
)
|
||||||
text_data = []
|
text_data = []
|
||||||
|
|
||||||
if in_table and len(table_data) > 0:
|
if in_table and len(table_data) > 0:
|
||||||
@ -296,15 +326,30 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
text = match.group(3) # The actual text of the list item
|
text = match.group(3) # The actual text of the list item
|
||||||
|
|
||||||
if marker == "*" or marker == "-":
|
if marker == "*" or marker == "-":
|
||||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
return {
|
||||||
"numbered": False, "indent": 0 if indent==None else len(indent)}
|
"type": "list_item",
|
||||||
|
"marker": marker,
|
||||||
|
"text": text.strip(),
|
||||||
|
"numbered": False,
|
||||||
|
"indent": 0 if indent == None else len(indent),
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
return {"type": "list_item", "marker": marker, "text": text.strip(),
|
return {
|
||||||
"numbered": True, "indent": 0 if indent==None else len(indent)}
|
"type": "list_item",
|
||||||
|
"marker": marker,
|
||||||
|
"text": text.strip(),
|
||||||
|
"numbered": True,
|
||||||
|
"indent": 0 if indent == None else len(indent),
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
# Fallback if no match
|
# Fallback if no match
|
||||||
return {"type": "list_item", "marker": item_marker, "text": line,
|
return {
|
||||||
"numbered": False, "indent": 0}
|
"type": "list_item",
|
||||||
|
"marker": item_marker,
|
||||||
|
"text": line,
|
||||||
|
"numbered": False,
|
||||||
|
"indent": 0,
|
||||||
|
}
|
||||||
|
|
||||||
# ========= Tables
|
# ========= Tables
|
||||||
def is_table_line(self, line):
|
def is_table_line(self, line):
|
||||||
@ -357,14 +402,14 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
||||||
if mtch:
|
if mtch:
|
||||||
picture_path = mtch.group(1).strip()
|
picture_path = mtch.group(1).strip()
|
||||||
attributes = mtch.group(2).split(',')
|
attributes = mtch.group(2).split(",")
|
||||||
picture_info = {"type": "picture", "uri": picture_path}
|
picture_info = {"type": "picture", "uri": picture_path}
|
||||||
|
|
||||||
# Extract optional attributes (alt text, width, height, alignment)
|
# Extract optional attributes (alt text, width, height, alignment)
|
||||||
if attributes:
|
if attributes:
|
||||||
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
||||||
for attr in attributes[1:]:
|
for attr in attributes[1:]:
|
||||||
key, value = attr.split('=')
|
key, value = attr.split("=")
|
||||||
picture_info[key.strip()] = value.strip()
|
picture_info[key.strip()] = value.strip()
|
||||||
|
|
||||||
return picture_info
|
return picture_info
|
||||||
|
Loading…
Reference in New Issue
Block a user