Apply renamings to DocItemLabel

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-09-25 12:22:02 +02:00
8 changed files with 290 additions and 328 deletions

View File

@@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
import typer
from docling_core.utils.file import resolve_file_source
from pydantic import AnyUrl
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -181,58 +180,25 @@ def convert(
else:
input_doc_paths.append(source)
###########################################################################
match backend:
case Backend.PYPDFIUM2:
do_cell_matching = ocr # only do cell matching when OCR enabled
pdf_backend = PyPdfiumDocumentBackend
case Backend.DOCLING:
do_cell_matching = True
pdf_backend = DoclingParseDocumentBackend
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at a time to see the differences in the output.
doc_converter = None
if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = False
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=PyPdfiumDocumentBackend,
)
elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=PyPdfiumDocumentBackend,
)
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
###########################################################################
pipeline_options = PipelineOptions(
do_ocr=ocr,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
)
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)

View File

@@ -6,7 +6,7 @@ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from docling_core.types.experimental.base import BoundingBox, Size
from docling_core.types.experimental.document import BaseFigureData, TableCell
from docling_core.types.experimental.labels import PageLabel
from docling_core.types.experimental.labels import DocItemLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
@@ -51,14 +51,14 @@ class OcrCell(Cell):
class Cluster(BaseModel):
id: int
label: PageLabel
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []
class BasePageElement(BaseModel):
label: PageLabel
label: DocItemLabel
id: int
page_no: int
cluster: Cluster

View File

@@ -12,7 +12,7 @@ from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, TableCell
from docling_core.types.experimental.document import DoclingDocument, FileInfo
from docling_core.types.experimental.labels import PageLabel
from docling_core.types.experimental.labels import DocItemLabel
from pydantic import BaseModel
from typing_extensions import deprecated
@@ -35,21 +35,21 @@ from docling.utils.utils import create_file_hash
_log = logging.getLogger(__name__)
layout_label_to_ds_type = {
PageLabel.TITLE: "title",
PageLabel.DOCUMENT_INDEX: "table-of-contents",
PageLabel.SECTION_HEADER: "subtitle-level-1",
PageLabel.CHECKBOX_SELECTED: "checkbox-selected",
PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
PageLabel.CAPTION: "caption",
PageLabel.PAGE_HEADER: "page-header",
PageLabel.PAGE_FOOTER: "page-footer",
PageLabel.FOOTNOTE: "footnote",
PageLabel.TABLE: "table",
PageLabel.FORMULA: "equation",
PageLabel.LIST_ITEM: "paragraph",
PageLabel.CODE: "paragraph",
PageLabel.PICTURE: "figure",
PageLabel.TEXT: "paragraph",
DocItemLabel.TITLE: "title",
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
DocItemLabel.CAPTION: "caption",
DocItemLabel.PAGE_HEADER: "page-header",
DocItemLabel.PAGE_FOOTER: "page-footer",
DocItemLabel.FOOTNOTE: "footnote",
DocItemLabel.TABLE: "table",
DocItemLabel.FORMULA: "equation",
DocItemLabel.LIST_ITEM: "paragraph",
DocItemLabel.CODE: "paragraph",
DocItemLabel.PICTURE: "figure",
DocItemLabel.TEXT: "paragraph",
}
_EMPTY_DOC = DsDocument(
@@ -330,8 +330,10 @@ class ConvertedDocument(BaseModel):
"paragraph",
"caption",
"table",
"figure",
],
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
):
return self.output.export_to_markdown(
delim=delim,
@@ -339,6 +341,7 @@ class ConvertedDocument(BaseModel):
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
def render_as_text(

View File

@@ -5,7 +5,7 @@ import time
from typing import Iterable, List
from docling_core.types.experimental.base import CoordOrigin
from docling_core.types.experimental.labels import PageLabel
from docling_core.types.experimental.labels import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
@@ -24,23 +24,23 @@ _log = logging.getLogger(__name__)
class LayoutModel:
TEXT_ELEM_LABELS = [
PageLabel.TEXT,
PageLabel.FOOTNOTE,
PageLabel.CAPTION,
PageLabel.CHECKBOX_UNSELECTED,
PageLabel.CHECKBOX_SELECTED,
PageLabel.SECTION_HEADER,
PageLabel.PAGE_HEADER,
PageLabel.PAGE_FOOTER,
PageLabel.CODE,
PageLabel.LIST_ITEM,
DocItemLabel.TEXT,
DocItemLabel.FOOTNOTE,
DocItemLabel.CAPTION,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.SECTION_HEADER,
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
DocItemLabel.CODE,
DocItemLabel.LIST_ITEM,
# "Formula",
]
PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER]
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
TABLE_LABEL = PageLabel.TABLE
FIGURE_LABEL = PageLabel.PICTURE
FORMULA_LABEL = PageLabel.FORMULA
TABLE_LABEL = DocItemLabel.TABLE
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA
def __init__(self, config):
self.config = config
@@ -51,27 +51,27 @@ class LayoutModel:
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
PageLabel.CAPTION: 0.35,
PageLabel.FOOTNOTE: 0.35,
PageLabel.FORMULA: 0.35,
PageLabel.LIST_ITEM: 0.35,
PageLabel.PAGE_FOOTER: 0.35,
PageLabel.PAGE_HEADER: 0.35,
PageLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
PageLabel.SECTION_HEADER: 0.45,
PageLabel.TABLE: 0.35,
PageLabel.TEXT: 0.45,
PageLabel.TITLE: 0.45,
PageLabel.DOCUMENT_INDEX: 0.45,
PageLabel.CODE: 0.45,
PageLabel.CHECKBOX_SELECTED: 0.45,
PageLabel.CHECKBOX_UNSELECTED: 0.45,
PageLabel.FORM: 0.45,
PageLabel.KEY_VALUE_REGION: 0.45,
DocItemLabel.CAPTION: 0.35,
DocItemLabel.FOOTNOTE: 0.35,
DocItemLabel.FORMULA: 0.35,
DocItemLabel.LIST_ITEM: 0.35,
DocItemLabel.PAGE_FOOTER: 0.35,
DocItemLabel.PAGE_HEADER: 0.35,
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER: 0.45,
DocItemLabel.TABLE: 0.35,
DocItemLabel.TEXT: 0.45,
DocItemLabel.TITLE: 0.45,
DocItemLabel.DOCUMENT_INDEX: 0.45,
DocItemLabel.CODE: 0.45,
DocItemLabel.CHECKBOX_SELECTED: 0.45,
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
DocItemLabel.FORM: 0.45,
DocItemLabel.KEY_VALUE_REGION: 0.45,
}
CLASS_REMAPPINGS = {
PageLabel.DOCUMENT_INDEX: PageLabel.TABLE,
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
}
_log.debug("================= Start postprocess function ====================")
@@ -258,7 +258,7 @@ class LayoutModel:
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
confidence=c["confidence"],
label=PageLabel(c["type"]),
label=DocItemLabel(c["type"]),
cells=cluster_cells,
)
clusters_out_new.append(c_new)
@@ -271,7 +271,7 @@ class LayoutModel:
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
label = PageLabel(
label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(

View File

@@ -2,7 +2,7 @@ import copy
import logging
import networkx as nx
from docling_core.types.experimental.labels import PageLabel
from docling_core.types.experimental.labels import DocItemLabel
logger = logging.getLogger("layout_utils")
@@ -371,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE:
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -381,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]):
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == PageLabel.PICTURE:
if cluster["type"] == DocItemLabel.PICTURE:
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
@@ -588,7 +588,7 @@ def set_orphan_as_text(
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == PageLabel.PICTURE:
if cluster["type"] == DocItemLabel.PICTURE:
figures.append(cluster)
if cluster["id"] > max_id:
@@ -639,13 +639,13 @@ def set_orphan_as_text(
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = PageLabel.TEXT
class_type = DocItemLabel.TEXT
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = PageLabel.TEXT
class_type = DocItemLabel.TEXT
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -719,7 +719,9 @@ def merge_cells(cluster_predictions):
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id)
new_merged_cluster = build_cluster_from_lines(
lines, DocItemLabel.TEXT, max_id
)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions
@@ -754,9 +756,9 @@ def clean_up_clusters(
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == PageLabel.TEXT
and cluster_2["type"] == PageLabel.PICTURE
or cluster_2["type"] == PageLabel.TABLE
cluster_1["type"] == DocItemLabel.TEXT
and cluster_2["type"] == DocItemLabel.PICTURE
or cluster_2["type"] == DocItemLabel.TABLE
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -773,7 +775,7 @@ def clean_up_clusters(
# remove tables that have one pdf cell
if one_cell_table == True:
if (
cluster_1["type"] == PageLabel.TABLE
cluster_1["type"] == DocItemLabel.TABLE
and len(cluster_1["cell_ids"]) < 2
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])