mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix: fix HTML table parser and JATS backend bugs (#1948)
Fix a bug in parsing HTML tables in the HTML backend. Fix a bug in the test file that prevented the JATS backend tests from running. Ensure that the JATS backend creates headings with the right level. Remove unnecessary data files for testing the JATS backend. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
d6d2dbe2f9
commit
e1e3053695
@@ -14,9 +14,9 @@ from .verify_utils import verify_document, verify_export
|
||||
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
def get_pubmed_paths():
|
||||
directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
|
||||
xml_files = sorted(directory.rglob("*.xml"))
|
||||
def get_jats_paths():
|
||||
directory = Path(os.path.dirname(__file__) + "/data/jats/")
|
||||
xml_files = sorted(directory.rglob("*.nxml"))
|
||||
return xml_files
|
||||
|
||||
|
||||
@@ -25,20 +25,20 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def test_e2e_pubmed_conversions(use_stream=False):
|
||||
pubmed_paths = get_pubmed_paths()
|
||||
def test_e2e_jats_conversions(use_stream=False):
|
||||
jats_paths = get_jats_paths()
|
||||
converter = get_converter()
|
||||
|
||||
for pubmed_path in pubmed_paths:
|
||||
for jats_path in jats_paths:
|
||||
gt_path = (
|
||||
pubmed_path.parent.parent / "groundtruth" / "docling_v2" / pubmed_path.name
|
||||
jats_path.parent.parent / "groundtruth" / "docling_v2" / jats_path.name
|
||||
)
|
||||
if use_stream:
|
||||
buf = BytesIO(pubmed_path.open("rb").read())
|
||||
stream = DocumentStream(name=pubmed_path.name, stream=buf)
|
||||
buf = BytesIO(jats_path.open("rb").read())
|
||||
stream = DocumentStream(name=jats_path.name, stream=buf)
|
||||
conv_result: ConversionResult = converter.convert(stream)
|
||||
else:
|
||||
conv_result: ConversionResult = converter.convert(pubmed_path)
|
||||
conv_result: ConversionResult = converter.convert(jats_path)
|
||||
doc: DoclingDocument = conv_result.document
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
@@ -54,9 +54,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
|
||||
|
||||
|
||||
def test_e2e_pubmed_conversions_stream():
|
||||
test_e2e_pubmed_conversions(use_stream=True)
|
||||
def test_e2e_jats_conversions_stream():
|
||||
test_e2e_jats_conversions(use_stream=True)
|
||||
|
||||
|
||||
def test_e2e_pubmed_conversions_no_stream():
|
||||
test_e2e_pubmed_conversions(use_stream=False)
|
||||
def test_e2e_jats_conversions_no_stream():
|
||||
test_e2e_jats_conversions(use_stream=False)
|
||||
|
||||
Reference in New Issue
Block a user