Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)

Commit 3e4093db58 (parent 7c3f9b7ab1): use HTMLParser and add options from CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
@@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
 ocr_file = self._tar.extractfile(ocr_info.path)
 assert ocr_file is not None
 ocr_content = ocr_file.read()
-ocr_root: etree._Element = etree.fromstring(ocr_content)
+parser = etree.HTMLParser()
+ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

 line_cells: List[TextCell] = []
 word_cells: List[TextCell] = []

-ns = {"x": "http://www.w3.org/1999/xhtml"}
-page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)
+page_div = ocr_root.xpath("//div[@class='ocr_page']")

 size = Size(width=im.size[0], height=im.size[1])
 if page_div:
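The change above swaps lxml's XML parsing for its HTML parser when reading the hOCR inside the METS GBS tarball. Because the hOCR is served as XHTML, the XML parser places every element in the XHTML namespace and forces namespace-qualified XPath queries; HTMLParser ignores the namespace declaration and also tolerates imperfect markup. A minimal standalone sketch (not docling code) of that difference:

# Minimal sketch, assuming only that lxml is installed; the hOCR snippet is made up.
from lxml import etree

hocr = b"""<html xmlns="http://www.w3.org/1999/xhtml">
<body><div class="ocr_page" title="bbox 0 0 100 100"></div></body>
</html>"""

# XML parsing keeps the XHTML default namespace, so queries need a prefix map.
xml_root = etree.fromstring(hocr)
ns = {"x": "http://www.w3.org/1999/xhtml"}
assert xml_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)

# HTMLParser drops the namespace declaration (and is forgiving of malformed
# markup common in OCR output), so plain tag names match directly.
html_root = etree.fromstring(hocr, parser=etree.HTMLParser())
assert html_root.xpath("//div[@class='ocr_page']")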
@@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
 im = im.convert("RGB")

 # Extract all ocrx_word spans
-for ix, word in enumerate(
-    ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns)
-):
+for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
     text = "".join(word.itertext()).strip()
     title = word.attrib.get("title", "")
     rect = _extract_rect(title)
@@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):

 # Extract all ocr_line spans
 # line: etree._Element
-for ix, line in enumerate(
-    ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns)
-):
+for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
     text = "".join(line.itertext()).strip()
     title = line.attrib.get("title", "")
     rect = _extract_rect(title)
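For context, each ocrx_word / ocr_line span carries its geometry in the hOCR title attribute, which _extract_rect (not shown in this diff) parses. A hypothetical standalone parser, purely illustrative and not docling's implementation:

# Hypothetical helper, not docling's _extract_rect: shows the hOCR "title"
# convention the loops above rely on, e.g. title="bbox 270 587 1385 1995; x_wconf 96".
import re

def parse_hocr_bbox(title: str):
    match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
    if match is None:
        return None
    left, top, right, bottom = map(int, match.groups())
    return left, top, right, bottom

assert parse_hocr_bbox("bbox 270 587 1385 1995; x_wconf 96") == (270, 587, 1385, 1995)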
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -601,9 +602,18 @@ def convert( # noqa: C901
     backend=backend, # pdf_backend
 )

+# METS GBS options
+mets_gbs_options = pipeline_options.model_copy()
+mets_gbs_options.do_ocr = False
+mets_gbs_format_option = PdfFormatOption(
+    pipeline_options=mets_gbs_options,
+    backend=MetsGbsDocumentBackend,
+)
+
 format_options = {
     InputFormat.PDF: pdf_format_option,
     InputFormat.IMAGE: pdf_format_option,
+    InputFormat.XML_METS_GBS: mets_gbs_format_option,
 }

 elif pipeline == ProcessingPipeline.VLM:
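The CLI change above reuses the PDF pipeline options for METS GBS input but turns OCR off, since these archives already carry hOCR text. A rough sketch of the equivalent programmatic wiring, assuming InputFormat.XML_METS_GBS and MetsGbsDocumentBackend exist as introduced in this commit (the remaining names follow docling's public converter API):

# Sketch of wiring the new backend outside the CLI; the input file name is an example.
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend  # added in this commit
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()

# METS GBS archives ship pre-computed hOCR, so OCR is disabled for them (per the diff above).
mets_gbs_options = pipeline_options.model_copy()
mets_gbs_options.do_ocr = False

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        InputFormat.XML_METS_GBS: PdfFormatOption(  # assumed to exist after this change
            pipeline_options=mets_gbs_options,
            backend=MetsGbsDocumentBackend,
        ),
    }
)
result = converter.convert("32044009881525_select.tar.gz")
print(result.document.export_to_markdown())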
@@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel):
 if member.name.endswith(".xml"):
     file = tar.extractfile(member)
     if file is not None:
-        content_str = file.read().decode()
+        content_str = file.read().decode(errors="ignore")
         if "http://www.loc.gov/METS/" in content_str:
             return "application/mets+xml"
 return None
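The decode(errors="ignore") change makes format detection tolerant of XML members whose bytes are not clean UTF-8. A self-contained sketch of the same detection idea for a METS GBS tarball (for illustration, not the docling implementation):

import tarfile
from typing import Optional

def sniff_mets_gbs(path: str) -> Optional[str]:
    # Hypothetical standalone version of the check above.
    with tarfile.open(path, "r:*") as tar:
        for member in tar.getmembers():
            if member.name.endswith(".xml"):
                file = tar.extractfile(member)
                if file is not None:
                    # errors="ignore" keeps detection working even if the XML
                    # contains stray bytes that are not valid UTF-8.
                    content_str = file.read().decode(errors="ignore")
                    if "http://www.loc.gov/METS/" in content_str:
                        return "application/mets+xml"
    return None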
@@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument

 @pytest.fixture
 def test_doc_path():
-    return Path("/Users/dol/Downloads/32044009881525.tar.gz")
+    return Path("/Users/dol/Downloads/32044009881525_select.tar.gz")


 def _get_backend(pdf_doc):
@@ -39,7 +39,7 @@ def test_process_pages(test_doc_path):

 def test_get_text_from_rect(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)

     # Get the title text of the DocLayNet paper
     textpiece = page_backend.get_text_in_rect(
@@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path):

 def test_crop_page_image(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)

     page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995)
@@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path):
     doc_backend.unload()


-def test_crop_page_image_jp2(test_doc_path):
-    doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(1)
-
-    page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173))
-    # im.show()
-
-    # Explicitly clean up resources
-    page_backend.unload()
-    doc_backend.unload()
-
-
 def test_num_pages(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
     assert doc_backend.is_valid()
-    assert doc_backend.page_count() == 276
+    assert doc_backend.page_count() == 3

     # Explicitly clean up resources to prevent race conditions in CI
     doc_backend.unload()