Merge branch 'dev/add-asciidocs-backend' of github.com:DS4SD/docling into cau/backend-document-origin

2025-08-01 23:12:20 +00:00 · 2024-10-22 11:04:49 +02:00 · 2024-10-22 11:04:49 +02:00 · 0bbd50f500
commit 0bbd50f500
parent d5460e2d1f bb3db07836
10 changed files with 1343 additions and 508 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -0,0 +1,433 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupItem,
+    GroupLabel,
+    ImageRef,
+    NodeItem,
+    Size,
+    TableCell,
+    TableData,
+)
+from pydantic import AnyUrl
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class AsciidocBackend(DeclarativeDocumentBackend):
+
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        """Keep a reference to the input source and flag the backend as valid."""
+        super().__init__(in_doc, path_or_stream)
+        self.valid = True
+        self.path_or_stream = path_or_stream
+
+    def is_valid(self) -> bool:
+        """Return the current value of the ``valid`` flag."""
+        is_usable = self.valid
+        return is_usable
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return False: this backend reports no pagination support."""
+        paginated = False
+        return paginated
+
+    def unload(self):
+        """Do nothing; the method body is intentionally empty."""
+        return None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled here (AsciiDoc only)."""
+        formats: Set[InputFormat] = set()
+        formats.add(InputFormat.ASCIIDOC)
+        return formats
+
+    def convert(self) -> DoclingDocument:
+        """
+        Build a DoclingDocument from the AsciiDoc input.
+
+        The origin records the file name (empty when the input is not a Path),
+        the "text/asciidoc" mimetype and ``self.document_hash``. The document is
+        named after the file stem, or "stream" when the file name is empty, and
+        is then populated by ``parse``.
+        """
+        is_file = isinstance(self.path_or_stream, Path)
+        fname = self.path_or_stream.name if is_file else ""
+
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="text/asciidoc",
+            binary_hash=self.document_hash,
+        )
+
+        # An empty file name falls back to the generic document name "stream".
+        docname = Path(fname).stem if fname else "stream"
+
+        return self.parse(DoclingDocument(name=docname, origin=origin))
+
+    def parse(self, doc: DoclingDocument):
+        """
+        Read the AsciiDoc source line by line and append the recognised
+        components (title, section headers, lists, tables, pictures, captions
+        and paragraphs) to ``doc``.
+
+        The source lines are also stored on ``self.lines``. Both ``Path`` and
+        ``BytesIO`` inputs are supported and are decoded as UTF-8.
+
+        Args:
+            doc: The document to populate; it is modified in place.
+
+        Returns:
+            The same ``doc`` instance that was passed in.
+        """
+        # Local import: only needed to decode in-memory streams.
+        from io import TextIOWrapper
+
+        self.lines = []
+        if isinstance(self.path_or_stream, Path):
+            with open(self.path_or_stream, "r", encoding="utf-8") as fr:
+                self.lines = fr.readlines()
+        elif isinstance(self.path_or_stream, BytesIO):
+            # Wrap a copy of the bytes so the caller's stream is neither
+            # consumed nor closed; the text wrapper applies the same newline
+            # translation as open() in text mode.
+            with TextIOWrapper(
+                BytesIO(self.path_or_stream.getvalue()), encoding="utf-8"
+            ) as fr:
+                self.lines = fr.readlines()
+
+        in_list = False
+        in_table = False
+
+        text_data: list[str] = []
+        table_data: list[list[str]] = []
+        caption_data: list[str] = []
+
+        # parents[k] is the most recent node at nesting level k (title at 0,
+        # headings at their own level, list groups below them). indents[k] is
+        # the indentation of the list group opened at level k and stays None
+        # for every level that is not an open list group.
+        parents: dict[int, Union[DocItem, GroupItem, None]] = {
+            i: None for i in range(10)
+        }
+        indents: dict[int, Union[int, None]] = {i: None for i in range(10)}
+
+        def flush_paragraph() -> None:
+            # Emit the pending paragraph lines (if any) as one PARAGRAPH item.
+            if text_data:
+                doc.add_text(
+                    text=" ".join(text_data),
+                    label=DocItemLabel.PARAGRAPH,
+                    parent=self.get_current_parent(parents),
+                )
+                text_data.clear()
+
+        def flush_table() -> None:
+            # Emit the collected rows (if any) as a table with the pending
+            # caption, then reset both buffers. A table without rows is
+            # skipped so the grid builder never sees an empty table.
+            if table_data:
+                caption = None
+                if caption_data:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+                doc.add_table(
+                    data=self.populate_table_as_grid(table_data),
+                    parent=self.get_current_parent(parents),
+                    caption=caption,
+                )
+            caption_data.clear()
+            table_data.clear()
+
+        def open_list(at_level: int, indent: int) -> None:
+            # Open a list group one level below ``at_level``.
+            parents[at_level + 1] = doc.add_group(
+                parent=parents[at_level], name="list", label=GroupLabel.LIST
+            )
+            indents[at_level + 1] = indent
+
+        def close_lists() -> None:
+            # Close every open list group, including nested ones.
+            for k in indents:
+                if indents[k] is not None:
+                    parents[k] = None
+                    indents[k] = None
+
+        for line in self.lines:
+            stripped = line.strip()
+
+            # Any non-list line ends every open list. This happens ahead of
+            # the dispatch below so that the line itself is still processed
+            # instead of being swallowed.
+            if in_list and not self.is_list_item(line):
+                in_list = False
+                close_lists()
+
+            # Title
+            if self.is_title(line):
+                flush_paragraph()
+                item = self.parse_title(line)
+                level = item["level"]
+
+                parents[level] = doc.add_text(
+                    text=item["text"], label=DocItemLabel.TITLE
+                )
+
+            # Section headers
+            elif self.is_section_header(line):
+                flush_paragraph()
+                item = self.parse_section_header(line)
+                level = item["level"]
+
+                parents[level] = doc.add_heading(
+                    text=item["text"], level=item["level"], parent=parents[level - 1]
+                )
+                # A new header invalidates everything nested below its level.
+                for k in parents:
+                    if k > level:
+                        parents[k] = None
+
+            # Lists
+            elif self.is_list_item(line):
+                flush_paragraph()
+
+                _log.debug("line: %s", line)
+                item = self.parse_list_item(line)
+                _log.debug("parsed list-item: %s", item)
+
+                level = self.get_current_level(parents)
+
+                if not in_list or indents[level] is None:
+                    # First item of a new list.
+                    in_list = True
+                    open_list(level, item["indent"])
+
+                elif item["indent"] > indents[level]:
+                    # Deeper indentation starts a nested list.
+                    open_list(level, item["indent"])
+
+                elif item["indent"] < indents[level]:
+                    # Shallower indentation: close nested lists until one with
+                    # a matching (or smaller) indentation is found.
+                    while (
+                        indents[level] is not None and item["indent"] < indents[level]
+                    ):
+                        _log.debug("%s => %s", item["indent"], indents[level])
+                        parents[level] = None
+                        indents[level] = None
+                        level -= 1
+
+                    if indents[level] is None:
+                        # Dedented past the outermost list: start a new one so
+                        # the item still lands inside a list group.
+                        open_list(level, item["indent"])
+
+                doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
+
+            # Tables
+            elif stripped == "|===" and not in_table:  # start of table
+                flush_paragraph()
+                in_table = True
+
+            elif self.is_table_line(line):  # within a table
+                if not in_table:
+                    flush_paragraph()
+                in_table = True
+                table_data.append(self.parse_table_line(line))
+
+            elif in_table:  # end of table (closing "|===" or any non-table line)
+                flush_table()
+                in_table = False
+
+            # Picture
+            elif self.is_picture(line):
+                flush_paragraph()
+
+                caption = None
+                if caption_data:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+                caption_data.clear()
+
+                item = self.parse_picture(line)
+                _log.debug("parsed picture: %s", item)
+
+                size = None
+                if "width" in item and "height" in item:
+                    try:
+                        size = Size(
+                            width=int(item["width"]), height=int(item["height"])
+                        )
+                    except ValueError:
+                        # Non-numeric dimensions (e.g. "50%") carry no pixel size.
+                        size = None
+
+                uri = None
+                raw_uri = item.get("uri", "")
+                if raw_uri.startswith("http"):
+                    # Remote images keep their URL unchanged.
+                    uri = raw_uri
+                elif raw_uri.startswith("//"):
+                    uri = "file:" + raw_uri
+                elif raw_uri.startswith("/"):
+                    # Absolute local path -> file:///abs/path (empty host).
+                    uri = "file://" + raw_uri
+                elif raw_uri:
+                    uri = "file://" + raw_uri
+
+                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
+                doc.add_picture(image=image, caption=caption)
+
+            # Caption
+            elif self.is_caption(line) and len(caption_data) == 0:
+                item = self.parse_caption(line)
+                caption_data.append(item["text"])
+
+            elif stripped and caption_data:  # allow multiline captions
+                item = self.parse_text(line)
+                caption_data.append(item["text"])
+
+            # Plain text
+            elif not stripped and text_data:
+                flush_paragraph()
+
+            elif stripped:  # allow multiline texts
+                item = self.parse_text(line)
+                text_data.append(item["text"])
+
+        # Emit whatever is still buffered when the input ends.
+        flush_paragraph()
+        if in_table:
+            flush_table()
+
+        return doc
+
+    def get_current_level(self, parents):
+        """
+        Return the deepest occupied nesting level.
+
+        Levels are scanned in the dict's iteration order; the result is the
+        level just before the first empty (None) slot above level 0, or 0 when
+        no such slot exists.
+        """
+        for level, node in parents.items():
+            if level > 0 and node is None:
+                return level - 1
+
+        return 0
+
+    def get_current_parent(self, parents):
+        """
+        Return the node stored at the deepest occupied nesting level.
+
+        Levels are scanned in the dict's iteration order; the result is the
+        entry just before the first empty (None) slot above level 0, or None
+        when no such slot exists.
+        """
+        for level, node in parents.items():
+            if level > 0 and node is None:
+                return parents[level - 1]
+
+        return None
+
+    #   =========   Title
+    def is_title(self, line):
+        """Return a match object if ``line`` starts with "= ", else None."""
+        title_pattern = r"^= "
+        return re.match(title_pattern, line)
+
+    def parse_title(self, line):
+        """Drop the first two characters (the "= " marker) and return a level-0 title item."""
+        title_text = line[2:].strip()
+        return dict(type="title", text=title_text, level=0)
+
+    #   =========   Section headers
+    def is_section_header(self, line):
+        """
+        Return a match object if ``line`` is a section header, else None.
+
+        A header is two or more "=" followed by blanks and some text. Requiring
+        the blank and the text keeps bare delimiter lines such as "====" and
+        marker-less text such as "==foo" from being treated as headers.
+        """
+        return re.match(r"^==+[ \t]+\S", line)
+
+    def parse_section_header(self, line):
+        """
+        Split a section header line into its level and text.
+
+        The level is the number of leading "=" minus one (so "== x" is level 1),
+        never below 0. Lines without a blank after the marker are handled too
+        instead of raising.
+        """
+        text = line.lstrip("=")
+        marker_length = len(line) - len(text)  # number of leading '='
+
+        return {
+            "type": "header",
+            "level": max(marker_length - 1, 0),
+            "text": text.strip(),
+        }
+
+    #   =========   Lists
+    def is_list_item(self, line):
+        """
+        Return a match object if ``line`` (optionally indented) starts with
+        "*", "-", digits plus "." or word characters plus ".", followed by a
+        space; otherwise return None.
+        """
+        list_marker_pattern = r"^(\s)*(\*|-|\d+\.|\w+\.) "
+        return re.match(list_marker_pattern, line)
+
+    def parse_list_item(self, line):
+        """
+        Extract the marker, text and indentation of a list item.
+
+        The accepted markers are the same as in ``is_list_item``: "*", "-",
+        digits plus "." and word characters plus ".". Items with a "*" or "-"
+        marker are unnumbered, all others are numbered. ``indent`` is the
+        number of leading whitespace characters.
+
+        A line without a recognisable marker is returned as an unnumbered item
+        with an empty marker and the stripped line as its text.
+        """
+        match = re.match(r"^(\s*)(\*|-|\d+\.|\w+\.)\s+(.*)", line)
+        if match is None:
+            # Fallback if no match
+            return {
+                "type": "list_item",
+                "marker": "",
+                "text": line.strip(),
+                "numbered": False,
+                "indent": 0,
+            }
+
+        indent, marker, text = match.groups()
+        return {
+            "type": "list_item",
+            "marker": marker,
+            "text": text.strip(),
+            "numbered": marker not in ("*", "-"),
+            "indent": len(indent),
+        }
+
+    #   =========   Tables
+    def is_table_line(self, line):
+        """Return a match object if ``line`` starts with "|" and contains a second "|", else None."""
+        table_row_pattern = r"^\|.*\|"
+        return re.match(table_row_pattern, line)
+
+    def parse_table_line(self, line):
+        """
+        Split one table row into its trimmed cell texts.
+
+        The leading "|" and an optional trailing "|" are treated as row
+        delimiters, so both "|a|b|" and "|a |b" yield two cells. Empty cells in
+        between are kept (as "") so that the remaining cells stay in their
+        columns. Escaped pipes are not handled.
+        """
+        row = line.strip()
+        if not row:
+            return []
+
+        if row.startswith("|"):
+            row = row[1:]
+        if row.endswith("|"):
+            row = row[:-1]
+
+        return [cell.strip() for cell in row.split("|")]
+
+    def populate_table_as_grid(self, table_data):
+        """
+        Convert parsed table rows into a ``TableData`` object.
+
+        ``table_data`` is a list of rows, each a list of cell texts. The table
+        gets one row per input row and as many columns as the longest row (0
+        for an empty table). Every text becomes a 1x1 cell at its row/column
+        position; no cell is marked as a header and short rows are not padded.
+        """
+
+        num_rows = len(table_data)
+
+        # default=0 keeps an empty table from raising in max().
+        num_cols = max((len(row) for row in table_data), default=0)
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+        for row_idx, row in enumerate(table_data):
+            for col_idx, text in enumerate(row):
+                row_span = 1
+                col_span = 1
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=False,
+                    row_header=False,
+                )
+                data.table_cells.append(cell)
+
+        return data
+
+    #   =========   Pictures
+    def is_picture(self, line):
+        """Return a match object if ``line`` starts with "image::", else None."""
+        image_macro_pattern = r"^image::"
+        return re.match(image_macro_pattern, line)
+
+    def parse_picture(self, line):
+        """
+        Parse an image macro, extracting its path and attributes.
+        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
+
+        The result always contains "type" and "uri". For a well-formed macro it
+        also contains "alt" (the first attribute) and one entry per
+        ``key=value`` attribute; the value may itself contain "=". Attributes
+        after the first that have no "=" are ignored. Surrounding whitespace on
+        the line is ignored.
+        """
+        stripped = line.strip()
+        mtch = re.match(r"^image::(.+)\[(.*)\]$", stripped)
+        if mtch is None:
+            # No attribute list: use whatever follows the macro prefix as target.
+            prefix = "image::"
+            target = stripped[len(prefix) :] if stripped.startswith(prefix) else stripped
+            return {"type": "picture", "uri": target}
+
+        picture_info = {"type": "picture", "uri": mtch.group(1).strip()}
+
+        # Extract optional attributes (alt text, width, height, alignment)
+        attributes = mtch.group(2).split(",")
+        picture_info["alt"] = attributes[0].strip()
+        for attr in attributes[1:]:
+            # partition() splits on the first "=" only and never raises.
+            key, sep, value = attr.partition("=")
+            if not sep:
+                continue
+            picture_info[key.strip()] = value.strip()
+
+        return picture_info
+
+    #   =========   Captions
+    def is_caption(self, line):
+        """Return a match object if ``line`` is "." followed by at least one character, else None."""
+        caption_pattern = r"^\.(.+)"
+        return re.match(caption_pattern, line)
+
+    def parse_caption(self, line):
+        """Return a caption item holding the text after the leading "." (empty if there is none)."""
+        found = re.match(r"^\.(.+)", line)
+        caption_text = found.group(1) if found else ""
+        return {"type": "caption", "text": caption_text}
+
+    #   =========   Plain text
+    def parse_text(self, line):
+        """Return a text item holding ``line`` without surrounding whitespace."""
+        cleaned = line.strip()
+        return dict(type="text", text=cleaned)
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -30,6 +30,7 @@ class InputFormat(str, Enum):
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
+    ASCIIDOC = "asciidoc"


 class OutputFormat(str, Enum):
@ -45,6 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["pdf"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
 }

 FormatToMimeType: Dict[InputFormat, Set[str]] = {
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
        "image/bmp",
    },
    InputFormat.PDF: {"application/pdf"},
+    InputFormat.ASCIIDOC: {"application/asciidoc"},
 }
 MimeTypeToFormat = {
    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -25,6 +25,7 @@ def main():
        Path("tests/data/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
        Path("tests/data/2206.01062.pdf"),
+        Path("tests/data/test_01.asciidoc"),
    ]

    ## for defaults use:
@ -40,6 +41,7 @@ def main():
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
+                InputFormat.ASCIIDOC,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -37,7 +37,8 @@ torchvision = [
 ######################
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^2.0.0"
+#docling-core = "^2.0.1"
+docling-core = { git = "https://github.com/DS4SD/docling-core.git", rev = "c78e8f16524fd378d9a261a74982f60fa9debd47" }
 docling-ibm-models = "^2.0.1"
 deepsearch-glm = "^0.25.0"
 filetype = "^1.2.0"
--- a/tests/data/groundtruth/docling_v2/test_01.asciidoc.md
+++ b/tests/data/groundtruth/docling_v2/test_01.asciidoc.md
@ -0,0 +1,24 @@
+# Sample Document Title
+
+## Section 1
+
+This is some introductory text in section 1.
+
+## Subsection 1.1
+
+- * First list item
+
+- * Second list item
+
+This is some introductory text in section 1.1.
+
+- - A dash list item
+
+## Section 2
+
+This is some text in section 2.
+
+| Header 1   | Header 2   |
+|------------|------------|
+| Value 1    | Value 2    |
+| Value 3    | Value 4    |
--- a/tests/data/groundtruth/docling_v2/test_02.asciidoc.md
+++ b/tests/data/groundtruth/docling_v2/test_02.asciidoc.md
@ -0,0 +1,83 @@
+2nd Sample Document Title
+
+This is an abstract.
+
+ Section 1: Testing nestedlists
+
+    - First item
+    - Nested item 1
+    - Nested item 2
+    - Second item
+    - Nested ordered item 1
+    - Nested ordered item 2
+    - Deeper nested unordered item
+    - Third item
+    - Nested ordered item 1
+    - Nested ordered item 2
+    - Deeper nested unordered item
+    - Nested ordered item 2
+
+ Section 2
+
+bla bla
+
+bla bla bla
+
+ Section 3: test image
+
+image::images/example1.png[Example Image, width=200, height=150, align=center]
+
+.An example caption for the image
+
+image::images/example2.png[Example Image, width=200, height=150, align=center]
+
+ Section 4: test tables
+
+
+| Header 1   | Header 2   |
+|------------|------------|
+| Value 1    | Value 2    |
+| Value 3    | Value 4    |
+
+.Caption for the table 1
+
+|===
+
+
+| Header 1   | Header 2   |
+|------------|------------|
+| Value 1    | Value 2    |
+| Value 3    | Value 4    |
+
+.Caption for the table 2
+
+|===
+
+
+| Column 1 Heading   | Column 2 Heading   | Column 3 Heading       |
+|--------------------|--------------------|------------------------|
+| Cell 1             | Cell 2             | Cell 3                 |
+| Cell 4             | Cell 5 colspan=2   | Cell spans two columns |
+
+.Caption for the table 3
+
+|===
+
+
+| Column 1 Heading   | Column 2 Heading   | Column 3 Heading   |
+|--------------------|--------------------|--------------------|
+| Rowspan=2          | Cell 2             | Cell 3             |
+| Cell 5             | Cell 6             |                    |
+
+.Caption for the table 4
+
+|===
+
+
+| Col 1               | Col 2                              | Col 3   | Col 4   |
+|---------------------|------------------------------------|---------|---------|
+| Rowspan=2.Colspan=2 | Cell spanning 2 rows and 2 columns | Col 3   | Col 4   |
+| Col 3               | Col 4                              |         |         |
+| Col 1               | Col 2                              | Col 3   | Col 4   |
+
+ SubSubSection 2.1.1
--- a/tests/data/test_01.asciidoc
+++ b/tests/data/test_01.asciidoc
@ -0,0 +1,25 @@
+= 1st Sample Document Title
+
+This is an abstract.
+
+== Section 1
+
+This is some introductory text in section 1.
+
+This spans multiple lines but should be treated
+as a single paragraph.
+    
+=== Subsection 1.1
+* First list item
+* Second list item
+
+This is some introductory text in section 1.1.
+
+- A dash list item
+    
+== Section 2
+This is some text in section 2.
+    
+|Header 1|Header 2|
+|Value 1|Value 2|
+|Value 3|Value 4|
--- a/tests/data/test_02.asciidoc
+++ b/tests/data/test_02.asciidoc
@ -0,0 +1,69 @@
+= 2nd Sample Document Title
+
+This is an abstract.
+
+== Section 1: Testing nestedlists
+
+* First item
+  * Nested item 1
+  * Nested item 2
+* Second item
+  1. Nested ordered item 1
+  2. Nested ordered item 2
+    * Deeper nested unordered item
+* Third item
+  1. Nested ordered item 1
+  2. Nested ordered item 2
+    * Deeper nested unordered item
+  3. Nested ordered item 2
+
+== Section 2
+
+bla bla
+
+==== SubSubSection 2.1.1
+
+bla bla bla
+bli bla ble
+
+== Section 3: test image
+
+image::images/example1.png[Example Image, width=200, height=150, align=center]
+
+.An example caption for the image
+image::images/example2.png[Example Image, width=200, height=150, align=center]
+
+== Section 4: test tables
+
+|Header 1|Header 2|
+|Value 1|Value 2|
+|Value 3|Value 4|
+
+.Caption for the table 1
+|===
+|Header 1 |Header 2
+|Value 1  |Value 2
+|Value 3  |Value 4
+|===
+
+.Caption for the table 2
+|=== 
+|Column 1 Heading |Column 2 Heading |Column 3 Heading
+|Cell 1 |Cell 2 |Cell 3
+|Cell 4 |Cell 5 colspan=2|Cell spans two columns
+|===
+
+.Caption for the table 3
+|===
+|Column 1 Heading |Column 2 Heading |Column 3 Heading
+|Rowspan=2 |Cell 2 |Cell 3
+| |Cell 5 |Cell 6
+|===
+
+.Caption for the table 4
+|===
+|Col 1 |Col 2 |Col 3 |Col 4
+|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
+|   |   |Col 3 |Col 4
+|Col 1 |Col 2 |Col 3 |Col 4
+|===
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -0,0 +1,56 @@
+import glob
+import os
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import BoundingBox
+
+from docling.backend.asciidoc_backend import AsciidocBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+
+def _get_backend(fname):
+    """Create an InputDocument for ``fname`` with the AsciiDoc backend and return that backend."""
+    return InputDocument(
+        path_or_stream=fname,
+        format=InputFormat.ASCIIDOC,
+        backend=AsciidocBackend,
+    )._backend
+
+
+def test_asciidocs_examples():
+    """
+    Convert every ``*.asciidoc`` sample under ``tests/data`` and export it.
+
+    A missing ground-truth markdown file is created from the current output.
+    The strict comparison against existing ground truth is still disabled.
+    The test runs unattended: it no longer waits for keyboard input between
+    files.
+    """
+
+    fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
+
+    for fname in fnames:
+        print(f"reading {fname}")
+
+        bname = os.path.basename(fname)
+        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
+
+        doc_backend = _get_backend(Path(fname))
+        doc = doc_backend.convert()
+
+        pred_itdoc = doc._export_to_indented_text(max_text_len=16)
+        print("\n\n", pred_itdoc)
+
+        pred_mddoc = doc.export_to_markdown()
+        print("\n\n", pred_mddoc)
+
+        if os.path.exists(gname):
+            with open(gname, "r") as fr:
+                true_mddoc = fr.read()
+
+            # TODO: enable once the ground truth matches the backend output.
+            # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
+        else:
+            with open(gname, "w") as fw:
+                fw.write(pred_mddoc)
+
+    assert True