Merge branch 'dev/add-asciidocs-backend' of github.com:DS4SD/docling into cau/backend-document-origin

This commit is contained in:
Christoph Auer 2024-10-22 11:04:49 +02:00
commit 0bbd50f500
10 changed files with 1343 additions and 508 deletions

View File

@ -0,0 +1,433 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
)
from pydantic import AnyUrl
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class AsciidocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
self.valid = True
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
return
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.ASCIIDOC}
def convert(self) -> DoclingDocument:
"""
Parses the ASCII into a structured document model.
"""
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
mimetype="text/asciidoc",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
doc = self.parse(doc)
return doc
def parse(self, doc: DoclingDocument):
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.
"""
content = ""
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r") as fr:
self.lines = fr.readlines()
# self.lines = file_content.splitlines()
in_list = False
in_table = False
text_data: list[str] = []
table_data: list[str] = []
caption_data: list[str] = []
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
parents: dict[int, Union[GroupItem, None]] = {}
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
for i in range(0, 10):
parents[i] = None
indents[i] = None
for line in self.lines:
# line = line.strip()
# Title
if self.is_title(line):
item = self.parse_title(line)
level = item["level"]
parents[level] = doc.add_text(
text=item["text"], label=DocItemLabel.TITLE
)
# Section headers
elif self.is_section_header(line):
item = self.parse_section_header(line)
level = item["level"]
parents[level] = doc.add_heading(
text=item["text"], level=item["level"], parent=parents[level - 1]
)
for k, v in parents.items():
if k > level:
parents[k] = None
# Lists
elif self.is_list_item(line):
print("line: ", line)
item = self.parse_list_item(line)
print("parsed list-item: ", item)
level = self.get_current_level(parents)
if not in_list:
in_list = True
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] > indents[level]:
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]:
print(item["indent"], " => ", indents[level])
parents[level] = None
indents[level] = None
level -= 1
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
elif in_list and not self.is_list_item(line):
in_list = False
level = self.get_current_level(parents)
parents[level] = None
# Tables
elif line.strip() == "|===" and not in_table: # start of table
in_table = True
elif self.is_table_line(line): # within a table
in_table = True
table_data.append(self.parse_table_line(line))
elif in_table and (
(not self.is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
data = self.populate_table_as_grid(table_data)
doc.add_table(
data=data, parent=self.get_current_parent(parents), caption=caption
)
in_table = False
table_data = []
# Picture
elif self.is_picture(line):
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
item = self.parse_picture(line)
print(item)
size = None
if "width" in item and "height" in item:
size = Size(width=int(item["width"]), height=int(item["height"]))
uri = None
if (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("//")
):
uri = "file:" + item["uri"]
elif (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("/")
):
uri = "file:/" + item["uri"]
elif "uri" in item and not item["uri"].startswith("http"):
uri = "file://" + item["uri"]
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
doc.add_picture(image=image, caption=caption)
# Caption
elif self.is_caption(line) and len(caption_data) == 0:
item = self.parse_caption(line)
caption_data.append(item["text"])
elif (
len(line.strip()) > 0 and len(caption_data) > 0
): # allow multiline captions
item = self.parse_text(line)
caption_data.append(item["text"])
# Plain text
elif len(line.strip()) == 0 and len(text_data) > 0:
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = []
elif len(line.strip()) > 0: # allow multiline texts
item = self.parse_text(line)
text_data.append(item["text"])
if len(text_data) > 0:
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents),
)
text_data = []
if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents))
in_table = False
table_data = []
return doc
def get_current_level(self, parents):
for k, v in parents.items():
if v == None and k > 0:
return k - 1
return 0
def get_current_parent(self, parents):
for k, v in parents.items():
if v == None and k > 0:
return parents[k - 1]
return None
# ========= Title
def is_title(self, line):
return re.match(r"^= ", line)
def parse_title(self, line):
return {"type": "title", "text": line[2:].strip(), "level": 0}
# ========= Section headers
def is_section_header(self, line):
return re.match(r"^==+", line)
def parse_section_header(self, line):
match = re.match(r"^(=+)\s+(.*)", line)
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
text = match.group(2) # The actual text of the list item
header_level = marker.count("=") # number of '=' represents level
return {
"type": "header",
"level": header_level - 1,
"text": text.strip(),
}
# ========= Lists
def is_list_item(self, line):
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
def parse_list_item(self, line):
"""Extract the item marker (number or bullet symbol) and the text of the item."""
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
if match:
indent = match.group(1)
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
text = match.group(3) # The actual text of the list item
if marker == "*" or marker == "-":
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
}
else:
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
}
else:
# Fallback if no match
return {
"type": "list_item",
"marker": item_marker,
"text": line,
"numbered": False,
"indent": 0,
}
# ========= Tables
def is_table_line(self, line):
return re.match(r"^\|.*\|", line)
def parse_table_line(self, line):
# Split table cells and trim extra spaces
return [cell.strip() for cell in line.split("|") if cell.strip()]
def populate_table_as_grid(self, table_data):
num_rows = len(table_data)
# Adjust the table data into a grid format
num_cols = max(len(row) for row in table_data)
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
for row_idx, row in enumerate(table_data):
# Pad rows with empty strings to match column count
# grid.append(row + [''] * (max_cols - len(row)))
for col_idx, text in enumerate(row):
row_span = 1
col_span = 1
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False,
row_header=False,
)
data.table_cells.append(cell)
return data
# ========= Pictures
def is_picture(self, line):
return re.match(r"^image::", line)
def parse_picture(self, line):
"""
Parse an image macro, extracting its path and attributes.
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
"""
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
if mtch:
picture_path = mtch.group(1).strip()
attributes = mtch.group(2).split(",")
picture_info = {"type": "picture", "uri": picture_path}
# Extract optional attributes (alt text, width, height, alignment)
if attributes:
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
for attr in attributes[1:]:
key, value = attr.split("=")
picture_info[key.strip()] = value.strip()
return picture_info
return {"type": "picture", "uri": line}
# ========= Captions
def is_caption(self, line):
return re.match(r"^\.(.+)", line)
def parse_caption(self, line):
mtch = re.match(r"^\.(.+)", line)
if mtch:
text = mtch.group(1)
return {"type": "caption", "text": text}
return {"type": "caption", "text": ""}
# ========= Plain text
def parse_text(self, line):
return {"type": "text", "text": line.strip()}

View File

@ -30,6 +30,7 @@ class InputFormat(str, Enum):
HTML = "html"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
class OutputFormat(str, Enum):
@ -45,6 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
}
FormatToMimeType: Dict[InputFormat, Set[str]] = {
@ -66,6 +68,7 @@ FormatToMimeType: Dict[InputFormat, Set[str]] = {
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
InputFormat.ASCIIDOC: {"application/asciidoc"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes

View File

@ -25,6 +25,7 @@ def main():
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
]
## for defaults use:
@ -40,6 +41,7 @@ def main():
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(

1153
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -37,7 +37,8 @@ torchvision = [
######################
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^2.0.0"
#docling-core = "^2.0.1"
docling-core = { git = "https://github.com/DS4SD/docling-core.git", rev = "c78e8f16524fd378d9a261a74982f60fa9debd47" }
docling-ibm-models = "^2.0.1"
deepsearch-glm = "^0.25.0"
filetype = "^1.2.0"

View File

@ -0,0 +1,24 @@
# Sample Document Title
## Section 1
This is some introductory text in section 1.
## Subsection 1.1
- * First list item
- * Second list item
This is some introductory text in section 1.1.
- - A dash list item
## Section 2
This is some text in section 2.
| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |

View File

@ -0,0 +1,83 @@
2nd Sample Document Title
This is an abstract.
Section 1: Testing nestedlists
- First item
- Nested item 1
- Nested item 2
- Second item
- Nested ordered item 1
- Nested ordered item 2
- Deeper nested unordered item
- Third item
- Nested ordered item 1
- Nested ordered item 2
- Deeper nested unordered item
- Nested ordered item 2
Section 2
bla bla
bla bla bla
Section 3: test image
image::images/example1.png[Example Image, width=200, height=150, align=center]
.An example caption for the image
image::images/example2.png[Example Image, width=200, height=150, align=center]
Section 4: test tables
| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |
.Caption for the table 1
|===
| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |
.Caption for the table 2
|===
| Column 1 Heading | Column 2 Heading | Column 3 Heading |
|--------------------|--------------------|------------------------|
| Cell 1 | Cell 2 | Cell 3 |
| Cell 4 | Cell 5 colspan=2 | Cell spans two columns |
.Caption for the table 3
|===
| Column 1 Heading | Column 2 Heading | Column 3 Heading |
|--------------------|--------------------|--------------------|
| Rowspan=2 | Cell 2 | Cell 3 |
| Cell 5 | Cell 6 | |
.Caption for the table 4
|===
| Col 1 | Col 2 | Col 3 | Col 4 |
|---------------------|------------------------------------|---------|---------|
| Rowspan=2.Colspan=2 | Cell spanning 2 rows and 2 columns | Col 3 | Col 4 |
| Col 3 | Col 4 | | |
| Col 1 | Col 2 | Col 3 | Col 4 |
SubSubSection 2.1.1

View File

@ -0,0 +1,25 @@
= 1st Sample Document Title
This is an abstract.
== Section 1
This is some introductory text in section 1.
This spans multiple lines but should be treated
as a single paragraph.
=== Subsection 1.1
* First list item
* Second list item
This is some introductory text in section 1.1.
- A dash list item
== Section 2
This is some text in section 2.
|Header 1|Header 2|
|Value 1|Value 2|
|Value 3|Value 4|

View File

@ -0,0 +1,69 @@
= 2nd Sample Document Title
This is an abstract.
== Section 1: Testing nestedlists
* First item
* Nested item 1
* Nested item 2
* Second item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
* Third item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
3. Nested ordered item 2
== Section 2
bla bla
==== SubSubSection 2.1.1
bla bla bla
bli bla ble
== Section 3: test image
image::images/example1.png[Example Image, width=200, height=150, align=center]
.An example caption for the image
image::images/example2.png[Example Image, width=200, height=150, align=center]
== Section 4: test tables
|Header 1|Header 2|
|Value 1|Value 2|
|Value 3|Value 4|
.Caption for the table 1
|===
|Header 1 |Header 2
|Value 1 |Value 2
|Value 3 |Value 4
|===
.Caption for the table 2
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Cell 1 |Cell 2 |Cell 3
|Cell 4 |Cell 5 colspan=2|Cell spans two columns
|===
.Caption for the table 3
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Rowspan=2 |Cell 2 |Cell 3
| |Cell 5 |Cell 6
|===
.Caption for the table 4
|===
|Col 1 |Col 2 |Col 3 |Col 4
|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
| | |Col 3 |Col 4
|Col 1 |Col 2 |Col 3 |Col 4
|===

View File

@ -0,0 +1,56 @@
import glob
import os
from pathlib import Path
import pytest
from docling_core.types.doc import BoundingBox
from docling.backend.asciidoc_backend import AsciidocBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
def _get_backend(fname):
in_doc = InputDocument(
path_or_stream=fname,
format=InputFormat.ASCIIDOC,
backend=AsciidocBackend,
)
doc_backend = in_doc._backend
return doc_backend
def test_asciidocs_examples():
fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
for fname in fnames:
print(f"reading {fname}")
bname = os.path.basename(fname)
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
doc_backend = _get_backend(Path(fname))
doc = doc_backend.convert()
pred_itdoc = doc._export_to_indented_text(max_text_len=16)
print("\n\n", pred_itdoc)
pred_mddoc = doc.export_to_markdown()
print("\n\n", pred_mddoc)
if os.path.exists(gname):
with open(gname, "r") as fr:
true_mddoc = fr.read()
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else:
with open(gname, "w") as fw:
fw.write(pred_mddoc)
# print("\n\n", doc.export_to_markdown())
input("continue")
assert True