From a436be73676101cc9461a17ae7a9ae72316a5096 Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Mon, 14 Jul 2025 18:32:01 +0200
Subject: [PATCH] feat: Add option to control empty clusters in layout postprocessing (#1940)

Add option to control empty clusters in layout postprocessing

Signed-off-by: Christoph Auer
---
 docling/datamodel/pipeline_options.py | 3 +++
 docling/utils/layout_postprocessor.py | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 99039e1f..06169fb8 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
     """Options for layout processing."""
 
     create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    keep_empty_clusters: bool = (
+        False  # Whether to keep clusters that contain no text cells
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
 
 
diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py
index a98b3aab..effce01b 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -267,8 +267,9 @@ class LayoutPostprocessor:
         # Initial cell assignment
         clusters = self._assign_cells_to_clusters(clusters)
 
-        # Remove clusters with no cells
-        clusters = [cluster for cluster in clusters if cluster.cells]
+        # Remove clusters with no cells (if keep_empty_clusters is False)
+        if not self.options.keep_empty_clusters:
+            clusters = [cluster for cluster in clusters if cluster.cells]
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)