diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 2061f54f..9dcfadb3 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -4,20 +4,20 @@ from io import BytesIO from pathlib import Path from typing import Set, Union -from pydantic import ( - AnyUrl, -) - from docling_core.types.doc import ( - Size, + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, + ImageRef, + NodeItem, + Size, TableCell, TableData, - ImageRef, ) +from pydantic import AnyUrl from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -90,108 +90,122 @@ class AsciidocBackend(DeclarativeDocumentBackend): in_list = False in_table = False - text_data = [] - table_data = [] - caption_data = [] + text_data: list[str] = [] + table_data: list[str] = [] + caption_data: list[str] = [] + + # parents: dict[int, Union[DocItem, GroupItem, None]] = {} + parents: dict[int, Union[GroupItem, None]] = {} + # indents: dict[int, Union[DocItem, GroupItem, None]] = {} + indents: dict[int, Union[GroupItem, None]] = {} - parents = {} - indents = {} - for i in range(0, 10): parents[i] = None indents[i] = None - + for line in self.lines: - #line = line.strip() + # line = line.strip() # Title if self.is_title(line): item = self.parse_title(line) level = item["level"] - - parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE) - + + parents[level] = doc.add_text( + text=item["text"], label=DocItemLabel.TITLE + ) + # Section headers elif self.is_section_header(line): item = self.parse_section_header(line) level = item["level"] - - parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1]) - for k,v in parents.items(): - if k>level: + + parents[level] = doc.add_heading( + text=item["text"], level=item["level"], parent=parents[level - 1] + ) + for k, v in parents.items(): + if k > level: parents[k] = None - + # Lists elif self.is_list_item(line): print("line: ", line) item = self.parse_list_item(line) print("parsed list-item: ", item) - + level = self.get_current_level(parents) - + if not in_list: in_list = True - - parents[level+1] = doc.add_group( - parent=parents[level], name="list", label=GroupLabel.LIST - ) - indents[level+1] = item["indent"] - - elif in_list and item["indent"]>indents[level]: - parents[level+1] = doc.add_group( - parent=parents[level], name="list", label=GroupLabel.LIST - ) - indents[level+1] = item["indent"] - elif in_list and item["indent"] indents[level]: + parents[level + 1] = doc.add_group( + parent=parents[level], name="list", label=GroupLabel.LIST + ) + indents[level + 1] = item["indent"] + + elif in_list and item["indent"] < indents[level]: print(item["indent"], " => ", indents[level]) - while item["indent"] ", indents[level]) parents[level] = None indents[level] = None level -= 1 - + doc.add_list_item(item["text"], parent=self.get_current_parent(parents)) elif in_list and not self.is_list_item(line): in_list = False level = self.get_current_level(parents) - parents[level]=None + parents[level] = None # Tables - elif line.strip()=="|===" and not in_table: # start of table + elif line.strip() == "|===" and not in_table: # start of table in_table = True - - elif self.is_table_line(line): # within a table + + elif self.is_table_line(line): # within a table in_table = True table_data.append(self.parse_table_line(line)) - elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table + elif in_table and ( + (not self.is_table_line(line)) or line.strip() == "|===" + ): # end of table caption = None - if len(caption_data)>0: - caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION) + if len(caption_data) > 0: + caption = doc.add_text( + text=" ".join(caption_data), label=DocItemLabel.CAPTION + ) + + caption_data = [] - caption_data = [] - data = self.populate_table_as_grid(table_data) - doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption) + doc.add_table( + data=data, parent=self.get_current_parent(parents), caption=caption + ) in_table = False table_data = [] - + # Picture elif self.is_picture(line): caption = None - if len(caption_data)>0: - caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION) + if len(caption_data) > 0: + caption = doc.add_text( + text=" ".join(caption_data), label=DocItemLabel.CAPTION + ) + + caption_data = [] - caption_data = [] - item = self.parse_picture(line) print(item) @@ -200,41 +214,57 @@ class AsciidocBackend(DeclarativeDocumentBackend): size = Size(width=int(item["width"]), height=int(item["height"])) uri = None - if "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("//"): - uri = "file:"+item["uri"] - elif "uri" in item and not item["uri"].startswith("http") and item["uri"].startswith("/"): - uri = "file:/"+item["uri"] + if ( + "uri" in item + and not item["uri"].startswith("http") + and item["uri"].startswith("//") + ): + uri = "file:" + item["uri"] + elif ( + "uri" in item + and not item["uri"].startswith("http") + and item["uri"].startswith("/") + ): + uri = "file:/" + item["uri"] elif "uri" in item and not item["uri"].startswith("http"): - uri = "file://"+item["uri"] + uri = "file://" + item["uri"] image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri) doc.add_picture(image=image, caption=caption) - + # Caption - elif self.is_caption(line) and len(caption_data)==0: + elif self.is_caption(line) and len(caption_data) == 0: item = self.parse_caption(line) caption_data.append(item["text"]) - elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions + elif ( + len(line.strip()) > 0 and len(caption_data) > 0 + ): # allow multiline captions item = self.parse_text(line) caption_data.append(item["text"]) - + # Plain text - elif len(line.strip())==0 and len(text_data)>0: - doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH, - parent=self.get_current_parent(parents)) + elif len(line.strip()) == 0 and len(text_data) > 0: + doc.add_text( + text=" ".join(text_data), + label=DocItemLabel.PARAGRAPH, + parent=self.get_current_parent(parents), + ) text_data = [] - - elif len(line.strip())>0: # allow multiline texts - + + elif len(line.strip()) > 0: # allow multiline texts + item = self.parse_text(line) text_data.append(item["text"]) if len(text_data) > 0: - doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH, - parent=self.get_current_parent(parents)) + doc.add_text( + text=" ".join(text_data), + label=DocItemLabel.PARAGRAPH, + parent=self.get_current_parent(parents), + ) text_data = [] - + if in_table and len(table_data) > 0: data = self.populate_table_as_grid(table_data) doc.add_table(data=data, parent=self.get_current_parent(parents)) @@ -245,25 +275,25 @@ class AsciidocBackend(DeclarativeDocumentBackend): return doc def get_current_level(self, parents): - for k,v in parents.items(): - if v==None and k>0: - return k-1 + for k, v in parents.items(): + if v == None and k > 0: + return k - 1 return 0 - + def get_current_parent(self, parents): - for k,v in parents.items(): - if v==None and k>0: - return parents[k-1] + for k, v in parents.items(): + if v == None and k > 0: + return parents[k - 1] return None - + # ========= Title def is_title(self, line): return re.match(r"^= ", line) def parse_title(self, line): - return {"type": "title", "text": line[2:].strip(), "level":0} + return {"type": "title", "text": line[2:].strip(), "level": 0} # ========= Section headers def is_section_header(self, line): @@ -271,14 +301,14 @@ class AsciidocBackend(DeclarativeDocumentBackend): def parse_section_header(self, line): match = re.match(r"^(=+)\s+(.*)", line) - + marker = match.group(1) # The list marker (e.g., "*", "-", "1.") - text = match.group(2) # The actual text of the list item - + text = match.group(2) # The actual text of the list item + header_level = marker.count("=") # number of '=' represents level return { "type": "header", - "level": header_level-1, + "level": header_level - 1, "text": text.strip(), } @@ -293,19 +323,34 @@ class AsciidocBackend(DeclarativeDocumentBackend): if match: indent = match.group(1) marker = match.group(2) # The list marker (e.g., "*", "-", "1.") - text = match.group(3) # The actual text of the list item - - if marker=="*" or marker=="-": - return {"type": "list_item", "marker": marker, "text": text.strip(), - "numbered": False, "indent": 0 if indent==None else len(indent)} + text = match.group(3) # The actual text of the list item + + if marker == "*" or marker == "-": + return { + "type": "list_item", + "marker": marker, + "text": text.strip(), + "numbered": False, + "indent": 0 if indent == None else len(indent), + } else: - return {"type": "list_item", "marker": marker, "text": text.strip(), - "numbered": True, "indent": 0 if indent==None else len(indent)} + return { + "type": "list_item", + "marker": marker, + "text": text.strip(), + "numbered": True, + "indent": 0 if indent == None else len(indent), + } else: # Fallback if no match - return {"type": "list_item", "marker": item_marker, "text": line, - "numbered": False, "indent": 0} - + return { + "type": "list_item", + "marker": item_marker, + "text": line, + "numbered": False, + "indent": 0, + } + # ========= Tables def is_table_line(self, line): return re.match(r"^\|.*\|", line) @@ -357,18 +402,18 @@ class AsciidocBackend(DeclarativeDocumentBackend): mtch = re.match(r"^image::(.+)\[(.*)\]$", line) if mtch: picture_path = mtch.group(1).strip() - attributes = mtch.group(2).split(',') + attributes = mtch.group(2).split(",") picture_info = {"type": "picture", "uri": picture_path} # Extract optional attributes (alt text, width, height, alignment) if attributes: picture_info["alt"] = attributes[0].strip() if attributes[0] else "" for attr in attributes[1:]: - key, value = attr.split('=') + key, value = attr.split("=") picture_info[key.strip()] = value.strip() - + return picture_info - + return {"type": "picture", "uri": line} # ========= Captions @@ -382,7 +427,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): return {"type": "caption", "text": text} return {"type": "caption", "text": ""} - + # ========= Plain text def parse_text(self, line): return {"type": "text", "text": line.strip()} diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index ab84dbe8..7512698e 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -27,29 +27,29 @@ def test_asciidocs_examples(): for fname in fnames: print(f"reading {fname}") - + bname = os.path.basename(fname) gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md") - + doc_backend = _get_backend(Path(fname)) doc = doc_backend.convert() pred_itdoc = doc._export_to_indented_text(max_text_len=16) print("\n\n", pred_itdoc) - + pred_mddoc = doc.export_to_markdown() print("\n\n", pred_mddoc) - + if os.path.exists(gname): with open(gname, "r") as fr: true_mddoc = fr.read() - #assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" + # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" else: with open(gname, "w") as fw: fw.write(pred_mddoc) - #print("\n\n", doc.export_to_markdown()) + # print("\n\n", doc.export_to_markdown()) input("continue")