mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add code to expose text direction of cell
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
d7c9874a88
commit
7bdd6868ed
@ -11,7 +11,7 @@ from PIL import Image, ImageDraw
|
|||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell, Size
|
from docling.datamodel.base_models import Cell, Size, TextDirection
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
@ -97,6 +97,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
y0 = cell_data[cells_header.index("y0")]
|
y0 = cell_data[cells_header.index("y0")]
|
||||||
x1 = cell_data[cells_header.index("x1")]
|
x1 = cell_data[cells_header.index("x1")]
|
||||||
y1 = cell_data[cells_header.index("y1")]
|
y1 = cell_data[cells_header.index("y1")]
|
||||||
|
ltr = cell_data[cells_header.index("left_to_right")]
|
||||||
|
|
||||||
if x1 < x0:
|
if x1 < x0:
|
||||||
x0, x1 = x1, x0
|
x0, x1 = x1, x0
|
||||||
@ -116,6 +117,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
t=y1 * page_size.height / parser_height,
|
t=y1 * page_size.height / parser_height,
|
||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
).to_top_left_origin(page_size.height),
|
).to_top_left_origin(page_size.height),
|
||||||
|
text_direction=(
|
||||||
|
TextDirection.LEFT_TO_RIGHT
|
||||||
|
if ltr
|
||||||
|
else TextDirection.RIGHT_TO_LEFT
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cell_counter += 1
|
cell_counter += 1
|
||||||
|
@ -120,10 +120,16 @@ class ErrorItem(BaseModel):
|
|||||||
error_message: str
|
error_message: str
|
||||||
|
|
||||||
|
|
||||||
|
class TextDirection(str, Enum):
|
||||||
|
LEFT_TO_RIGHT = "left_to_right"
|
||||||
|
RIGHT_TO_LEFT = "right_to_left"
|
||||||
|
|
||||||
|
|
||||||
class Cell(BaseModel):
|
class Cell(BaseModel):
|
||||||
id: int
|
id: int
|
||||||
text: str
|
text: str
|
||||||
bbox: BoundingBox
|
bbox: BoundingBox
|
||||||
|
text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT
|
||||||
|
|
||||||
|
|
||||||
class OcrCell(Cell):
|
class OcrCell(Cell):
|
||||||
|
@ -264,6 +264,7 @@ class GlmModel:
|
|||||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
|
1 == 1
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
||||||
|
@ -11,6 +11,7 @@ from docling.datamodel.base_models import (
|
|||||||
Page,
|
Page,
|
||||||
PageElement,
|
PageElement,
|
||||||
Table,
|
Table,
|
||||||
|
TextDirection,
|
||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -75,12 +76,23 @@ class PageAssembleModel(BasePageModel):
|
|||||||
for cluster in page.predictions.layout.clusters:
|
for cluster in page.predictions.layout.clusters:
|
||||||
# _log.info("Cluster label seen:", cluster.label)
|
# _log.info("Cluster label seen:", cluster.label)
|
||||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||||
|
textlines = []
|
||||||
|
|
||||||
|
dominant_text_direction = TextDirection.LEFT_TO_RIGHT
|
||||||
|
|
||||||
|
# Naive code: dominant text direction == direction of first cell.
|
||||||
|
for cell in cluster.cells:
|
||||||
|
dominant_text_direction = cell.text_direction
|
||||||
|
break
|
||||||
|
|
||||||
|
for cell in cluster.cells:
|
||||||
|
text = cell.text.replace("\x02", "-").strip()
|
||||||
|
if text:
|
||||||
|
# if dominant_text_direction == TextDirection.RIGHT_TO_LEFT:
|
||||||
|
# textlines.insert(0, text) # Prepend RTL text
|
||||||
|
# else:
|
||||||
|
textlines.append(text) # Append LTR text
|
||||||
|
|
||||||
textlines = [
|
|
||||||
cell.text.replace("\x02", "-").strip()
|
|
||||||
for cell in cluster.cells
|
|
||||||
if len(cell.text.strip()) > 0
|
|
||||||
]
|
|
||||||
text = self.sanitize_text(textlines)
|
text = self.sanitize_text(textlines)
|
||||||
text_el = TextElement(
|
text_el = TextElement(
|
||||||
label=cluster.label,
|
label=cluster.label,
|
||||||
|
@ -49,6 +49,8 @@ def export_documents(
|
|||||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||||
fp.write(conv_res.document.export_to_markdown())
|
fp.write(conv_res.document.export_to_markdown())
|
||||||
|
|
||||||
|
conv_res.document.save_as_html(output_dir / f"{doc_filename}.html")
|
||||||
|
|
||||||
# Export Docling document format to text:
|
# Export Docling document format to text:
|
||||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
||||||
@ -103,10 +105,13 @@ def main():
|
|||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_paths = [
|
||||||
Path("./tests/data/2206.01062.pdf"),
|
Path("./tests/data/pdf/right_to_left_01.pdf"),
|
||||||
Path("./tests/data/2203.01017v2.pdf"),
|
Path("./tests/data/pdf/right_to_left_02.pdf"),
|
||||||
Path("./tests/data/2305.03393v1.pdf"),
|
Path("./tests/data/pdf/right_to_left_03.pdf"),
|
||||||
Path("./tests/data/redp5110_sampled.pdf"),
|
Path("./tests/data/pdf/2206.01062.pdf"),
|
||||||
|
Path("./tests/data/pdf/2203.01017v2.pdf"),
|
||||||
|
Path("./tests/data/pdf/2305.03393v1.pdf"),
|
||||||
|
Path("./tests/data/pdf/redp5110_sampled.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||||
|
@ -26,6 +26,17 @@ def _get_backend(pdf_doc):
|
|||||||
return doc_backend
|
return doc_backend
|
||||||
|
|
||||||
|
|
||||||
|
def test_cell_ordering():
|
||||||
|
pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf")
|
||||||
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
|
for page_index in range(0, doc_backend.page_count()):
|
||||||
|
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index)
|
||||||
|
cells = list(page_backend.get_text_cells())
|
||||||
|
|
||||||
|
1 == 1
|
||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user