Use HTMLParser and add METS GBS options to the CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-07-25 15:10:05 +02:00
parent 7c3f9b7ab1
commit 3e4093db58
4 changed files with 20 additions and 26 deletions

View File

@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
ocr_file = self._tar.extractfile(ocr_info.path)
assert ocr_file is not None
ocr_content = ocr_file.read()
ocr_root: etree._Element = etree.fromstring(ocr_content)
parser = etree.HTMLParser()
ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
line_cells: List[TextCell] = []
word_cells: List[TextCell] = []
ns = {"x": "http://www.w3.org/1999/xhtml"}
page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)
page_div = ocr_root.xpath("//div[@class='ocr_page']")
size = Size(width=im.size[0], height=im.size[1])
if page_div:
@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
im = im.convert("RGB")
# Extract all ocrx_word spans
for ix, word in enumerate(
ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns)
):
for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
text = "".join(word.itertext()).strip()
title = word.attrib.get("title", "")
rect = _extract_rect(title)
@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
# Extract all ocr_line spans
# line: etree._Element
for ix, line in enumerate(
ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns)
):
for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
text = "".join(line.itertext()).strip()
title = line.attrib.get("title", "")
rect = _extract_rect(title)

View File

@ -26,6 +26,7 @@ from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@ -601,9 +602,18 @@ def convert( # noqa: C901
backend=backend, # pdf_backend
)
# METS GBS options
mets_gbs_options = pipeline_options.model_copy()
mets_gbs_options.do_ocr = False
mets_gbs_format_option = PdfFormatOption(
pipeline_options=mets_gbs_options,
backend=MetsGbsDocumentBackend,
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.XML_METS_GBS: mets_gbs_format_option,
}
elif pipeline == ProcessingPipeline.VLM:

View File

@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel):
if member.name.endswith(".xml"):
file = tar.extractfile(member)
if file is not None:
content_str = file.read().decode()
content_str = file.read().decode(errors="ignore")
if "http://www.loc.gov/METS/" in content_str:
return "application/mets+xml"
return None

View File

@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument
@pytest.fixture
def test_doc_path():
return Path("/Users/dol/Downloads/32044009881525.tar.gz")
return Path("/Users/dol/Downloads/32044009881525_select.tar.gz")
def _get_backend(pdf_doc):
@ -39,7 +39,7 @@ def test_process_pages(test_doc_path):
def test_get_text_from_rect(test_doc_path):
doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
page_backend: MetsGbsPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper
textpiece = page_backend.get_text_in_rect(
@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path):
def test_crop_page_image(test_doc_path):
doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
page_backend: MetsGbsPageBackend = doc_backend.load_page(0)
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995)
@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path):
doc_backend.unload()
def test_crop_page_image_jp2(test_doc_path):
doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
page_backend: MetsGbsPageBackend = doc_backend.load_page(1)
page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173))
# im.show()
# Explicitly clean up resources
page_backend.unload()
doc_backend.unload()
def test_num_pages(test_doc_path):
doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
assert doc_backend.is_valid()
assert doc_backend.page_count() == 276
assert doc_backend.page_count() == 3
# Explicitly clean up resources to prevent race conditions in CI
doc_backend.unload()