diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 43e6fd71..164a4465 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -65,7 +65,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=docname, origin=origin) - doc = self.parse_stream(doc) + doc = self.parse(doc) return doc @@ -77,7 +77,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): content="" with open(self.path_or_stream, "r") as fr: - self.lines = fr.read_lines() + self.lines = fr.readlines() #self.lines = file_content.splitlines() @@ -91,7 +91,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): # Title if self.is_title(line): item = self.parse_title(line) - doc.set_title(text=item["text"]) + doc.add_text(text=item["text"], label="title") # Section headers elif self.is_section_header(line): @@ -104,7 +104,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): in_list = True item = self.parse_list_item(line) - doc.add_listitem(item["text"]) + doc.add_list_item(item["text"]) elif in_list and not self.is_list_item(line): in_list = False @@ -113,12 +113,11 @@ class AsciidocBackend(DeclarativeDocumentBackend): elif self.is_table_line(line): in_table = True table_data.append(self.parse_table_line(line)) - continue elif in_table and not self.is_table_line(line): - grid = self.populate_table_as_grid(table_data) - doc.add_table(data=grid) + data = self.populate_table_as_grid(table_data) + doc.add_table(data=data) in_table = False table_data = [] @@ -126,8 +125,15 @@ class AsciidocBackend(DeclarativeDocumentBackend): # Plain text elif line: item = self.parse_text(line) - doc.add_text(text=item["text"]) - + doc.add_text(text=item["text"], label="text") + + if in_table and len(table_data)>0: + data = self.populate_table_as_grid(table_data) + doc.add_table(data=data) + + in_table = False + table_data = [] + return doc # Title @@ -161,13 +167,34 @@ class AsciidocBackend(DeclarativeDocumentBackend): 
def populate_table_as_grid(self, table_data):
    """Convert parsed table rows into a ``TableData`` object.

    Args:
        table_data: A list of rows, where each row is a list of cell
            strings (presumably the output of ``parse_table_line``;
            verify against the caller).

    Returns:
        A ``TableData`` whose ``num_rows`` is the number of rows, whose
        ``num_cols`` is the length of the longest row, and whose
        ``table_cells`` holds one ``TableCell`` per parsed cell in
        row-major order. Every cell spans exactly one row and one column
        and is marked as neither a column header nor a row header.
    """
    num_rows = len(table_data)

    # The widest row decides the column count. ``default=0`` keeps an empty
    # table from raising ValueError inside ``max``.
    num_cols = max((len(row) for row in table_data), default=0)

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Rows shorter than ``num_cols`` simply contribute fewer cells; no
    # padding cells are created here.
    for row_idx, row in enumerate(table_data):
        for col_idx, text in enumerate(row):
            row_span = 1
            col_span = 1

            cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=False,
                row_header=False,
            )
            data.table_cells.append(cell)

    return data
import glob
import os

from pathlib import Path

import pytest
from docling_core.types.doc import BoundingBox

from docling.backend.asciidoc_backend import (
    AsciidocBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


def _get_backend(fname):
    """Build an ``InputDocument`` for *fname* and return its backend.

    Args:
        fname: Path of the AsciiDoc file to load.

    Returns:
        The backend object stored on the ``InputDocument`` (its
        ``_backend`` attribute), created with ``AsciidocBackend``.
    """
    in_doc = InputDocument(
        path_or_stream=fname,
        format=InputFormat.ASCIIDOC,
        backend=AsciidocBackend,
    )

    doc_backend = in_doc._backend
    return doc_backend


def test_asciidocs_examples():
    """Convert every AsciiDoc fixture and compare it to its ground truth.

    For each ``./tests/data/*.asciidoc`` file the document is converted and
    exported to Markdown. If a ground-truth file already exists, the export
    must match it exactly; otherwise the export is written out as the new
    ground truth.
    """
    fnames = sorted(glob.glob("./tests/data/*.asciidoc"))

    for fname in fnames:
        print(f"reading {fname}")

        bname = os.path.basename(fname)
        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")

        doc_backend = _get_backend(Path(fname))
        doc = doc_backend.convert()

        # Export once and reuse the result for printing, comparison and
        # ground-truth generation.
        pred_mddoc = doc.export_to_markdown()
        print("\n\n", pred_mddoc)

        if os.path.exists(gname):
            # Explicit encoding so the comparison does not depend on the
            # platform's default locale.
            with open(gname, "r", encoding="utf-8") as fr:
                true_mddoc = fr.read()

            assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
        else:
            with open(gname, "w", encoding="utf-8") as fw:
                fw.write(pred_mddoc)