From 70b2ae3fabc9196bc1021695db490e758d0a13db Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 18 Oct 2024 16:57:26 +0200 Subject: [PATCH] reformatted the code Signed-off-by: Peter Staar --- docling/backend/asciidoc_backend.py | 81 +++++++++++++++-------------- docling/datamodel/base_models.py | 2 +- tests/test_backend_asciidoc.py | 20 +++---- 3 files changed, 51 insertions(+), 52 deletions(-) diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 164a4465..8a6ca7df 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -1,6 +1,5 @@ -import re - import logging +import re from io import BytesIO from pathlib import Path from typing import Set, Union @@ -16,7 +15,8 @@ from docling_core.types.doc import ( from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat -#from docling.datamodel.document import InputDocument + +# from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) @@ -29,7 +29,7 @@ class AsciidocBackend(DeclarativeDocumentBackend): self.path_or_stream = path_or_stream self.valid = True - + def is_valid(self) -> bool: return self.valid @@ -61,12 +61,12 @@ class AsciidocBackend(DeclarativeDocumentBackend): if len(fname) > 0: docname = Path(fname).stem else: - docname = "stream" - + docname = "stream" + doc = DoclingDocument(name=docname, origin=origin) doc = self.parse(doc) - + return doc def parse(self, doc: DoclingDocument): @@ -75,19 +75,19 @@ class AsciidocBackend(DeclarativeDocumentBackend): title, section headers, text, lists, and tables. """ - content="" + content = "" with open(self.path_or_stream, "r") as fr: self.lines = fr.readlines() - - #self.lines = file_content.splitlines() - + + # self.lines = file_content.splitlines() + in_list = False in_table = False table_data = [] - + for line in self.lines: line = line.strip() - + # Title if self.is_title(line): item = self.parse_title(line) @@ -105,41 +105,41 @@ class AsciidocBackend(DeclarativeDocumentBackend): item = self.parse_list_item(line) doc.add_list_item(item["text"]) - + elif in_list and not self.is_list_item(line): in_list = False - + # Tables elif self.is_table_line(line): - in_table = True + in_table = True table_data.append(self.parse_table_line(line)) - + elif in_table and not self.is_table_line(line): - + data = self.populate_table_as_grid(table_data) doc.add_table(data=data) - + in_table = False table_data = [] - + # Plain text elif line: item = self.parse_text(line) doc.add_text(text=item["text"], label="text") - if in_table and len(table_data)>0: + if in_table and len(table_data) > 0: data = self.populate_table_as_grid(table_data) doc.add_table(data=data) - + in_table = False table_data = [] - + return doc # Title def is_title(self, line): return re.match(r"^= ", line) - + def parse_title(self, line): return {"type": "title", "text": line[2:].strip()} @@ -148,9 +148,13 @@ class AsciidocBackend(DeclarativeDocumentBackend): return re.match(r"^==+", line) def parse_section_header(self, line): - header_level = line.count('=') # number of '=' represents level - return {"type": "header", "level": header_level, "text": line[header_level:].strip()} - + header_level = line.count("=") # number of '=' represents level + return { + "type": "header", + "level": header_level, + "text": line[header_level:].strip(), + } + # Lists def is_list_item(self, line): return re.match(r"^(\*|-|\d+\.|\w+\.) ", line) @@ -164,24 +168,24 @@ class AsciidocBackend(DeclarativeDocumentBackend): def parse_table_line(self, line): # Split table cells and trim extra spaces - return [cell.strip() for cell in line.split('|') if cell.strip()] + return [cell.strip() for cell in line.split("|") if cell.strip()] def populate_table_as_grid(self, table_data): num_rows = len(table_data) - + # Adjust the table data into a grid format num_cols = max(len(row) for row in table_data) - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) - for row_idx,row in enumerate(table_data): + data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) + for row_idx, row in enumerate(table_data): # Pad rows with empty strings to match column count - #grid.append(row + [''] * (max_cols - len(row))) + # grid.append(row + [''] * (max_cols - len(row))) - for col_idx,text in enumerate(row): + for col_idx, text in enumerate(row): row_span = 1 col_span = 1 - + cell = TableCell( text=text, row_span=row_span, @@ -191,11 +195,12 @@ class AsciidocBackend(DeclarativeDocumentBackend): start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, col_header=False, - row_header=False) - data.table_cells.append(cell) - + row_header=False, + ) + data.table_cells.append(cell) + return data - + # Plain text def parse_text(self, line): return {"type": "text", "text": line} diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 0a424351..53f8ddef 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -46,7 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.PDF: ["pdf"], InputFormat.HTML: ["html", "htm", "xhtml"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], - InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"], + InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"], } FormatToMimeType: Dict[InputFormat, Set[str]] = { diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index eaad403a..5c159125 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -1,14 +1,11 @@ import glob import os - from pathlib import Path import pytest from docling_core.types.doc import BoundingBox -from docling.backend.asciidoc_backend import ( - AsciidocBackend, -) +from docling.backend.asciidoc_backend import AsciidocBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -25,15 +22,15 @@ def _get_backend(fname): def test_asciidocs_examples(): - + fnames = sorted(glob.glob("./tests/data/*.asciidoc")) - + for fname in fnames: print(f"reading {fname}") bname = os.path.basename(fname) - gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md") - + gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md") + doc_backend = _get_backend(Path(fname)) doc = doc_backend.convert() @@ -43,12 +40,9 @@ def test_asciidocs_examples(): with open(gname, "r") as fr: true_mddoc = fr.read() - assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" - else: + assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" + else: with open(gname, "w") as fw: fw.write(pred_mddoc) print("\n\n", doc.export_to_markdown()) - - -