mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
first working asciidoc parser
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
1138cae7f1
commit
5016daeae3
@ -65,7 +65,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
doc = DoclingDocument(name=docname, origin=origin)
|
doc = DoclingDocument(name=docname, origin=origin)
|
||||||
|
|
||||||
doc = self.parse_stream(doc)
|
doc = self.parse(doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -77,7 +77,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
content=""
|
content=""
|
||||||
with open(self.path_or_stream, "r") as fr:
|
with open(self.path_or_stream, "r") as fr:
|
||||||
self.lines = fr.read_lines()
|
self.lines = fr.readlines()
|
||||||
|
|
||||||
#self.lines = file_content.splitlines()
|
#self.lines = file_content.splitlines()
|
||||||
|
|
||||||
@ -91,7 +91,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
# Title
|
# Title
|
||||||
if self.is_title(line):
|
if self.is_title(line):
|
||||||
item = self.parse_title(line)
|
item = self.parse_title(line)
|
||||||
doc.set_title(text=item["text"])
|
doc.add_text(text=item["text"], label="title")
|
||||||
|
|
||||||
# Section headers
|
# Section headers
|
||||||
elif self.is_section_header(line):
|
elif self.is_section_header(line):
|
||||||
@ -104,7 +104,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
in_list = True
|
in_list = True
|
||||||
|
|
||||||
item = self.parse_list_item(line)
|
item = self.parse_list_item(line)
|
||||||
doc.add_listitem(item["text"])
|
doc.add_list_item(item["text"])
|
||||||
|
|
||||||
elif in_list and not self.is_list_item(line):
|
elif in_list and not self.is_list_item(line):
|
||||||
in_list = False
|
in_list = False
|
||||||
@ -113,12 +113,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
elif self.is_table_line(line):
|
elif self.is_table_line(line):
|
||||||
in_table = True
|
in_table = True
|
||||||
table_data.append(self.parse_table_line(line))
|
table_data.append(self.parse_table_line(line))
|
||||||
continue
|
|
||||||
|
|
||||||
elif in_table and not self.is_table_line(line):
|
elif in_table and not self.is_table_line(line):
|
||||||
|
|
||||||
grid = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=grid)
|
doc.add_table(data=data)
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
@ -126,8 +125,15 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
# Plain text
|
# Plain text
|
||||||
elif line:
|
elif line:
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
doc.add_text(text=item["text"])
|
doc.add_text(text=item["text"], label="text")
|
||||||
|
|
||||||
|
if in_table and len(table_data)>0:
|
||||||
|
data = self.populate_table_as_grid(table_data)
|
||||||
|
doc.add_table(data=data)
|
||||||
|
|
||||||
|
in_table = False
|
||||||
|
table_data = []
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
@ -161,13 +167,34 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
return [cell.strip() for cell in line.split('|') if cell.strip()]
|
return [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||||
|
|
||||||
def populate_table_as_grid(self, table_data):
|
def populate_table_as_grid(self, table_data):
|
||||||
|
|
||||||
|
num_rows = len(table_data)
|
||||||
|
|
||||||
# Adjust the table data into a grid format
|
# Adjust the table data into a grid format
|
||||||
max_cols = max(len(row) for row in table_data)
|
num_cols = max(len(row) for row in table_data)
|
||||||
grid = []
|
|
||||||
for row in table_data:
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||||
|
for row_idx,row in enumerate(table_data):
|
||||||
# Pad rows with empty strings to match column count
|
# Pad rows with empty strings to match column count
|
||||||
grid.append(row + [''] * (max_cols - len(row)))
|
#grid.append(row + [''] * (max_cols - len(row)))
|
||||||
return grid
|
|
||||||
|
for col_idx,text in enumerate(row):
|
||||||
|
row_span = 1
|
||||||
|
col_span = 1
|
||||||
|
|
||||||
|
cell = TableCell(
|
||||||
|
text=text,
|
||||||
|
row_span=row_span,
|
||||||
|
col_span=col_span,
|
||||||
|
start_row_offset_idx=row_idx,
|
||||||
|
end_row_offset_idx=row_idx + row_span,
|
||||||
|
start_col_offset_idx=col_idx,
|
||||||
|
end_col_offset_idx=col_idx + col_span,
|
||||||
|
col_header=False,
|
||||||
|
row_header=False)
|
||||||
|
data.table_cells.append(cell)
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
def parse_text(self, line):
|
def parse_text(self, line):
|
||||||
|
24
tests/data/groundtruth/docling_v2/test_01.asciidoc.md
Normal file
24
tests/data/groundtruth/docling_v2/test_01.asciidoc.md
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# Sample Document Title
|
||||||
|
|
||||||
|
## Section 1
|
||||||
|
|
||||||
|
This is some introductory text in section 1.
|
||||||
|
|
||||||
|
## Subsection 1.1
|
||||||
|
|
||||||
|
- * First list item
|
||||||
|
|
||||||
|
- * Second list item
|
||||||
|
|
||||||
|
This is some introductory text in section 1.1.
|
||||||
|
|
||||||
|
- - A dash list item
|
||||||
|
|
||||||
|
## Section 2
|
||||||
|
|
||||||
|
This is some text in section 2.
|
||||||
|
|
||||||
|
| Header 1 | Header 2 |
|
||||||
|
|------------|------------|
|
||||||
|
| Value 1 | Value 2 |
|
||||||
|
| Value 3 | Value 4 |
|
20
tests/data/test_01.asciidoc
Normal file
20
tests/data/test_01.asciidoc
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
= Sample Document Title
|
||||||
|
|
||||||
|
== Section 1
|
||||||
|
|
||||||
|
This is some introductory text in section 1.
|
||||||
|
|
||||||
|
=== Subsection 1.1
|
||||||
|
* First list item
|
||||||
|
* Second list item
|
||||||
|
|
||||||
|
This is some introductory text in section 1.1.
|
||||||
|
|
||||||
|
- A dash list item
|
||||||
|
|
||||||
|
== Section 2
|
||||||
|
This is some text in section 2.
|
||||||
|
|
||||||
|
|Header 1|Header 2|
|
||||||
|
|Value 1|Value 2|
|
||||||
|
|Value 3|Value 4|
|
54
tests/test_backend_asciidoc.py
Normal file
54
tests/test_backend_asciidoc.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import glob
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
|
from docling.backend.asciidoc_backend import (
|
||||||
|
AsciidocBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend(fname):
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=fname,
|
||||||
|
format=InputFormat.ASCIIDOC,
|
||||||
|
backend=AsciidocBackend,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc_backend = in_doc._backend
|
||||||
|
return doc_backend
|
||||||
|
|
||||||
|
|
||||||
|
def test_asciidocs_examples():
|
||||||
|
|
||||||
|
fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
|
||||||
|
|
||||||
|
for fname in fnames:
|
||||||
|
print(f"reading {fname}")
|
||||||
|
|
||||||
|
bname = os.path.basename(fname)
|
||||||
|
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md")
|
||||||
|
|
||||||
|
doc_backend = _get_backend(Path(fname))
|
||||||
|
doc = doc_backend.convert()
|
||||||
|
|
||||||
|
pred_mddoc = doc.export_to_markdown()
|
||||||
|
|
||||||
|
if os.path.exists(gname):
|
||||||
|
with open(gname, "r") as fr:
|
||||||
|
true_mddoc = fr.read()
|
||||||
|
|
||||||
|
assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||||
|
else:
|
||||||
|
with open(gname, "w") as fw:
|
||||||
|
fw.write(pred_mddoc)
|
||||||
|
|
||||||
|
print("\n\n", doc.export_to_markdown())
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user