reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-18 16:57:26 +02:00
parent 5016daeae3
commit 70b2ae3fab
3 changed files with 51 additions and 52 deletions

View File

@ -1,6 +1,5 @@
import re
import logging import logging
import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
@ -16,7 +15,8 @@ from docling_core.types.doc import (
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
#from docling.datamodel.document import InputDocument
# from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -29,7 +29,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.valid = True self.valid = True
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -61,12 +61,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
if len(fname) > 0: if len(fname) > 0:
docname = Path(fname).stem docname = Path(fname).stem
else: else:
docname = "stream" docname = "stream"
doc = DoclingDocument(name=docname, origin=origin) doc = DoclingDocument(name=docname, origin=origin)
doc = self.parse(doc) doc = self.parse(doc)
return doc return doc
def parse(self, doc: DoclingDocument): def parse(self, doc: DoclingDocument):
@ -75,19 +75,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
title, section headers, text, lists, and tables. title, section headers, text, lists, and tables.
""" """
content="" content = ""
with open(self.path_or_stream, "r") as fr: with open(self.path_or_stream, "r") as fr:
self.lines = fr.readlines() self.lines = fr.readlines()
#self.lines = file_content.splitlines() # self.lines = file_content.splitlines()
in_list = False in_list = False
in_table = False in_table = False
table_data = [] table_data = []
for line in self.lines: for line in self.lines:
line = line.strip() line = line.strip()
# Title # Title
if self.is_title(line): if self.is_title(line):
item = self.parse_title(line) item = self.parse_title(line)
@ -105,41 +105,41 @@ class AsciidocBackend(DeclarativeDocumentBackend):
item = self.parse_list_item(line) item = self.parse_list_item(line)
doc.add_list_item(item["text"]) doc.add_list_item(item["text"])
elif in_list and not self.is_list_item(line): elif in_list and not self.is_list_item(line):
in_list = False in_list = False
# Tables # Tables
elif self.is_table_line(line): elif self.is_table_line(line):
in_table = True in_table = True
table_data.append(self.parse_table_line(line)) table_data.append(self.parse_table_line(line))
elif in_table and not self.is_table_line(line): elif in_table and not self.is_table_line(line):
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data) doc.add_table(data=data)
in_table = False in_table = False
table_data = [] table_data = []
# Plain text # Plain text
elif line: elif line:
item = self.parse_text(line) item = self.parse_text(line)
doc.add_text(text=item["text"], label="text") doc.add_text(text=item["text"], label="text")
if in_table and len(table_data)>0: if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data) doc.add_table(data=data)
in_table = False in_table = False
table_data = [] table_data = []
return doc return doc
# Title # Title
def is_title(self, line): def is_title(self, line):
return re.match(r"^= ", line) return re.match(r"^= ", line)
def parse_title(self, line): def parse_title(self, line):
return {"type": "title", "text": line[2:].strip()} return {"type": "title", "text": line[2:].strip()}
@ -148,9 +148,13 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return re.match(r"^==+", line) return re.match(r"^==+", line)
def parse_section_header(self, line): def parse_section_header(self, line):
header_level = line.count('=') # number of '=' represents level header_level = line.count("=") # number of '=' represents level
return {"type": "header", "level": header_level, "text": line[header_level:].strip()} return {
"type": "header",
"level": header_level,
"text": line[header_level:].strip(),
}
# Lists # Lists
def is_list_item(self, line): def is_list_item(self, line):
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line) return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
@ -164,24 +168,24 @@ class AsciidocBackend(DeclarativeDocumentBackend):
def parse_table_line(self, line): def parse_table_line(self, line):
# Split table cells and trim extra spaces # Split table cells and trim extra spaces
return [cell.strip() for cell in line.split('|') if cell.strip()] return [cell.strip() for cell in line.split("|") if cell.strip()]
def populate_table_as_grid(self, table_data): def populate_table_as_grid(self, table_data):
num_rows = len(table_data) num_rows = len(table_data)
# Adjust the table data into a grid format # Adjust the table data into a grid format
num_cols = max(len(row) for row in table_data) num_cols = max(len(row) for row in table_data)
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
for row_idx,row in enumerate(table_data): for row_idx, row in enumerate(table_data):
# Pad rows with empty strings to match column count # Pad rows with empty strings to match column count
#grid.append(row + [''] * (max_cols - len(row))) # grid.append(row + [''] * (max_cols - len(row)))
for col_idx,text in enumerate(row): for col_idx, text in enumerate(row):
row_span = 1 row_span = 1
col_span = 1 col_span = 1
cell = TableCell( cell = TableCell(
text=text, text=text,
row_span=row_span, row_span=row_span,
@ -191,11 +195,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
start_col_offset_idx=col_idx, start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span, end_col_offset_idx=col_idx + col_span,
col_header=False, col_header=False,
row_header=False) row_header=False,
data.table_cells.append(cell) )
data.table_cells.append(cell)
return data return data
# Plain text # Plain text
def parse_text(self, line): def parse_text(self, line):
return {"type": "text", "text": line} return {"type": "text", "text": line}

View File

@ -46,7 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"], InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"], InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
} }
FormatToMimeType: Dict[InputFormat, Set[str]] = { FormatToMimeType: Dict[InputFormat, Set[str]] = {

View File

@ -1,14 +1,11 @@
import glob import glob
import os import os
from pathlib import Path from pathlib import Path
import pytest import pytest
from docling_core.types.doc import BoundingBox from docling_core.types.doc import BoundingBox
from docling.backend.asciidoc_backend import ( from docling.backend.asciidoc_backend import AsciidocBackend
AsciidocBackend,
)
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -25,15 +22,15 @@ def _get_backend(fname):
def test_asciidocs_examples(): def test_asciidocs_examples():
fnames = sorted(glob.glob("./tests/data/*.asciidoc")) fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
for fname in fnames: for fname in fnames:
print(f"reading {fname}") print(f"reading {fname}")
bname = os.path.basename(fname) bname = os.path.basename(fname)
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md") gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
doc_backend = _get_backend(Path(fname)) doc_backend = _get_backend(Path(fname))
doc = doc_backend.convert() doc = doc_backend.convert()
@ -43,12 +40,9 @@ def test_asciidocs_examples():
with open(gname, "r") as fr: with open(gname, "r") as fr:
true_mddoc = fr.read() true_mddoc = fr.read()
assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else: else:
with open(gname, "w") as fw: with open(gname, "w") as fw:
fw.write(pred_mddoc) fw.write(pred_mddoc)
print("\n\n", doc.export_to_markdown()) print("\n\n", doc.export_to_markdown())