mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
reformatted the code
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
5016daeae3
commit
70b2ae3fab
@ -1,6 +1,5 @@
|
|||||||
import re
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
@ -16,7 +15,8 @@ from docling_core.types.doc import (
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
#from docling.datamodel.document import InputDocument
|
|
||||||
|
# from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -29,7 +29,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
|
|
||||||
self.valid = True
|
self.valid = True
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
|
|
||||||
@ -61,12 +61,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
if len(fname) > 0:
|
if len(fname) > 0:
|
||||||
docname = Path(fname).stem
|
docname = Path(fname).stem
|
||||||
else:
|
else:
|
||||||
docname = "stream"
|
docname = "stream"
|
||||||
|
|
||||||
doc = DoclingDocument(name=docname, origin=origin)
|
doc = DoclingDocument(name=docname, origin=origin)
|
||||||
|
|
||||||
doc = self.parse(doc)
|
doc = self.parse(doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def parse(self, doc: DoclingDocument):
|
def parse(self, doc: DoclingDocument):
|
||||||
@ -75,19 +75,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
title, section headers, text, lists, and tables.
|
title, section headers, text, lists, and tables.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content=""
|
content = ""
|
||||||
with open(self.path_or_stream, "r") as fr:
|
with open(self.path_or_stream, "r") as fr:
|
||||||
self.lines = fr.readlines()
|
self.lines = fr.readlines()
|
||||||
|
|
||||||
#self.lines = file_content.splitlines()
|
# self.lines = file_content.splitlines()
|
||||||
|
|
||||||
in_list = False
|
in_list = False
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
if self.is_title(line):
|
if self.is_title(line):
|
||||||
item = self.parse_title(line)
|
item = self.parse_title(line)
|
||||||
@ -105,41 +105,41 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
item = self.parse_list_item(line)
|
item = self.parse_list_item(line)
|
||||||
doc.add_list_item(item["text"])
|
doc.add_list_item(item["text"])
|
||||||
|
|
||||||
elif in_list and not self.is_list_item(line):
|
elif in_list and not self.is_list_item(line):
|
||||||
in_list = False
|
in_list = False
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
elif self.is_table_line(line):
|
elif self.is_table_line(line):
|
||||||
in_table = True
|
in_table = True
|
||||||
table_data.append(self.parse_table_line(line))
|
table_data.append(self.parse_table_line(line))
|
||||||
|
|
||||||
elif in_table and not self.is_table_line(line):
|
elif in_table and not self.is_table_line(line):
|
||||||
|
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=data)
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
elif line:
|
elif line:
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
doc.add_text(text=item["text"], label="text")
|
doc.add_text(text=item["text"], label="text")
|
||||||
|
|
||||||
if in_table and len(table_data)>0:
|
if in_table and len(table_data) > 0:
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=data)
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
def is_title(self, line):
|
def is_title(self, line):
|
||||||
return re.match(r"^= ", line)
|
return re.match(r"^= ", line)
|
||||||
|
|
||||||
def parse_title(self, line):
|
def parse_title(self, line):
|
||||||
return {"type": "title", "text": line[2:].strip()}
|
return {"type": "title", "text": line[2:].strip()}
|
||||||
|
|
||||||
@ -148,9 +148,13 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
return re.match(r"^==+", line)
|
return re.match(r"^==+", line)
|
||||||
|
|
||||||
def parse_section_header(self, line):
|
def parse_section_header(self, line):
|
||||||
header_level = line.count('=') # number of '=' represents level
|
header_level = line.count("=") # number of '=' represents level
|
||||||
return {"type": "header", "level": header_level, "text": line[header_level:].strip()}
|
return {
|
||||||
|
"type": "header",
|
||||||
|
"level": header_level,
|
||||||
|
"text": line[header_level:].strip(),
|
||||||
|
}
|
||||||
|
|
||||||
# Lists
|
# Lists
|
||||||
def is_list_item(self, line):
|
def is_list_item(self, line):
|
||||||
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
|
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
|
||||||
@ -164,24 +168,24 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def parse_table_line(self, line):
|
def parse_table_line(self, line):
|
||||||
# Split table cells and trim extra spaces
|
# Split table cells and trim extra spaces
|
||||||
return [cell.strip() for cell in line.split('|') if cell.strip()]
|
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||||
|
|
||||||
def populate_table_as_grid(self, table_data):
|
def populate_table_as_grid(self, table_data):
|
||||||
|
|
||||||
num_rows = len(table_data)
|
num_rows = len(table_data)
|
||||||
|
|
||||||
# Adjust the table data into a grid format
|
# Adjust the table data into a grid format
|
||||||
num_cols = max(len(row) for row in table_data)
|
num_cols = max(len(row) for row in table_data)
|
||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||||
for row_idx,row in enumerate(table_data):
|
for row_idx, row in enumerate(table_data):
|
||||||
# Pad rows with empty strings to match column count
|
# Pad rows with empty strings to match column count
|
||||||
#grid.append(row + [''] * (max_cols - len(row)))
|
# grid.append(row + [''] * (max_cols - len(row)))
|
||||||
|
|
||||||
for col_idx,text in enumerate(row):
|
for col_idx, text in enumerate(row):
|
||||||
row_span = 1
|
row_span = 1
|
||||||
col_span = 1
|
col_span = 1
|
||||||
|
|
||||||
cell = TableCell(
|
cell = TableCell(
|
||||||
text=text,
|
text=text,
|
||||||
row_span=row_span,
|
row_span=row_span,
|
||||||
@ -191,11 +195,12 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
start_col_offset_idx=col_idx,
|
start_col_offset_idx=col_idx,
|
||||||
end_col_offset_idx=col_idx + col_span,
|
end_col_offset_idx=col_idx + col_span,
|
||||||
col_header=False,
|
col_header=False,
|
||||||
row_header=False)
|
row_header=False,
|
||||||
data.table_cells.append(cell)
|
)
|
||||||
|
data.table_cells.append(cell)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
def parse_text(self, line):
|
def parse_text(self, line):
|
||||||
return {"type": "text", "text": line}
|
return {"type": "text", "text": line}
|
||||||
|
@ -46,7 +46,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.PDF: ["pdf"],
|
InputFormat.PDF: ["pdf"],
|
||||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||||
|
@ -1,14 +1,11 @@
|
|||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from docling_core.types.doc import BoundingBox
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
from docling.backend.asciidoc_backend import (
|
from docling.backend.asciidoc_backend import AsciidocBackend
|
||||||
AsciidocBackend,
|
|
||||||
)
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
@ -25,15 +22,15 @@ def _get_backend(fname):
|
|||||||
|
|
||||||
|
|
||||||
def test_asciidocs_examples():
|
def test_asciidocs_examples():
|
||||||
|
|
||||||
fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
|
fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
|
||||||
|
|
||||||
for fname in fnames:
|
for fname in fnames:
|
||||||
print(f"reading {fname}")
|
print(f"reading {fname}")
|
||||||
|
|
||||||
bname = os.path.basename(fname)
|
bname = os.path.basename(fname)
|
||||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname+".md")
|
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
||||||
|
|
||||||
doc_backend = _get_backend(Path(fname))
|
doc_backend = _get_backend(Path(fname))
|
||||||
doc = doc_backend.convert()
|
doc = doc_backend.convert()
|
||||||
|
|
||||||
@ -43,12 +40,9 @@ def test_asciidocs_examples():
|
|||||||
with open(gname, "r") as fr:
|
with open(gname, "r") as fr:
|
||||||
true_mddoc = fr.read()
|
true_mddoc = fr.read()
|
||||||
|
|
||||||
assert pred_mddoc==true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||||
else:
|
else:
|
||||||
with open(gname, "w") as fw:
|
with open(gname, "w") as fw:
|
||||||
fw.write(pred_mddoc)
|
fw.write(pred_mddoc)
|
||||||
|
|
||||||
print("\n\n", doc.export_to_markdown())
|
print("\n\n", doc.export_to_markdown())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user