mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fixes for cell indexing
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
7bc1c1ac3d
commit
7c3f9b7ab1
@ -326,7 +326,9 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
|
|||||||
im = im.convert("RGB")
|
im = im.convert("RGB")
|
||||||
|
|
||||||
# Extract all ocrx_word spans
|
# Extract all ocrx_word spans
|
||||||
for word in ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns):
|
for ix, word in enumerate(
|
||||||
|
ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns)
|
||||||
|
):
|
||||||
text = "".join(word.itertext()).strip()
|
text = "".join(word.itertext()).strip()
|
||||||
title = word.attrib.get("title", "")
|
title = word.attrib.get("title", "")
|
||||||
rect = _extract_rect(title)
|
rect = _extract_rect(title)
|
||||||
@ -334,13 +336,20 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
|
|||||||
if rect:
|
if rect:
|
||||||
word_cells.append(
|
word_cells.append(
|
||||||
TextCell(
|
TextCell(
|
||||||
text=text, orig=text, rect=rect, from_ocr=True, confidence=conf
|
index=ix,
|
||||||
|
text=text,
|
||||||
|
orig=text,
|
||||||
|
rect=rect,
|
||||||
|
from_ocr=True,
|
||||||
|
confidence=conf,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract all ocr_line spans
|
# Extract all ocr_line spans
|
||||||
# line: etree._Element
|
# line: etree._Element
|
||||||
for line in ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns):
|
for ix, line in enumerate(
|
||||||
|
ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns)
|
||||||
|
):
|
||||||
text = "".join(line.itertext()).strip()
|
text = "".join(line.itertext()).strip()
|
||||||
title = line.attrib.get("title", "")
|
title = line.attrib.get("title", "")
|
||||||
rect = _extract_rect(title)
|
rect = _extract_rect(title)
|
||||||
@ -348,7 +357,12 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
|
|||||||
if rect:
|
if rect:
|
||||||
line_cells.append(
|
line_cells.append(
|
||||||
TextCell(
|
TextCell(
|
||||||
text=text, orig=text, rect=rect, from_ocr=True, confidence=conf
|
index=ix,
|
||||||
|
text=text,
|
||||||
|
orig=text,
|
||||||
|
rect=rect,
|
||||||
|
from_ocr=True,
|
||||||
|
confidence=conf,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user