mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
reformatted the code
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
5016daeae3
commit
70b2ae3fab
@@ -1,6 +1,5 @@
|
||||
import re
|
||||
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
@@ -16,7 +15,8 @@ from docling_core.types.doc import (
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
#from docling.datamodel.document import InputDocument
|
||||
|
||||
# from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -75,11 +75,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
title, section headers, text, lists, and tables.
|
||||
"""
|
||||
|
||||
content=""
|
||||
content = ""
|
||||
with open(self.path_or_stream, "r") as fr:
|
||||
self.lines = fr.readlines()
|
||||
|
||||
#self.lines = file_content.splitlines()
|
||||
# self.lines = file_content.splitlines()
|
||||
|
||||
in_list = False
|
||||
in_table = False
|
||||
@@ -127,7 +127,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
item = self.parse_text(line)
|
||||
doc.add_text(text=item["text"], label="text")
|
||||
|
||||
if in_table and len(table_data)>0:
|
||||
if in_table and len(table_data) > 0:
|
||||
data = self.populate_table_as_grid(table_data)
|
||||
doc.add_table(data=data)
|
||||
|
||||
@@ -148,8 +148,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
return re.match(r"^==+", line)
|
||||
|
||||
def parse_section_header(self, line):
|
||||
header_level = line.count('=') # number of '=' represents level
|
||||
return {"type": "header", "level": header_level, "text": line[header_level:].strip()}
|
||||
header_level = line.count("=") # number of '=' represents level
|
||||
return {
|
||||
"type": "header",
|
||||
"level": header_level,
|
||||
"text": line[header_level:].strip(),
|
||||
}
|
||||
|
||||
# Lists
|
||||
def is_list_item(self, line):
|
||||
@@ -164,7 +168,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def parse_table_line(self, line):
|
||||
# Split table cells and trim extra spaces
|
||||
return [cell.strip() for cell in line.split('|') if cell.strip()]
|
||||
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||
|
||||
def populate_table_as_grid(self, table_data):
|
||||
|
||||
@@ -174,11 +178,11 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
num_cols = max(len(row) for row in table_data)
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
for row_idx,row in enumerate(table_data):
|
||||
for row_idx, row in enumerate(table_data):
|
||||
# Pad rows with empty strings to match column count
|
||||
#grid.append(row + [''] * (max_cols - len(row)))
|
||||
# grid.append(row + [''] * (max_cols - len(row)))
|
||||
|
||||
for col_idx,text in enumerate(row):
|
||||
for col_idx, text in enumerate(row):
|
||||
row_span = 1
|
||||
col_span = 1
|
||||
|
||||
@@ -191,7 +195,8 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False,
|
||||
row_header=False)
|
||||
row_header=False,
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
|
||||
return data
|
||||
|
@@ -1,14 +1,11 @@
|
||||
import glob
|
||||
import os
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import BoundingBox
|
||||
|
||||
from docling.backend.asciidoc_backend import (
|
||||
AsciidocBackend,
|
||||
)
|
||||
from docling.backend.asciidoc_backend import AsciidocBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -32,7 +29,7 @@ def test_asciidocs_examples():
|
||||
print(f"reading {fname}")
|
||||
|
||||
bname = os.path.basename(fname)
|
||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md")
|
||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
||||
|
||||
doc_backend = _get_backend(Path(fname))
|
||||
doc = doc_backend.convert()
|
||||
@@ -43,12 +40,9 @@ def test_asciidocs_examples():
|
||||
with open(gname, "r") as fr:
|
||||
true_mddoc = fr.read()
|
||||
|
||||
assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||
assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||
else:
|
||||
with open(gname, "w") as fw:
|
||||
fw.write(pred_mddoc)
|
||||
|
||||
print("\n\n", doc.export_to_markdown())
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user