mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Merge a1acce83b9
into 98e2fcff63
This commit is contained in:
commit
94410b6d34
@ -219,6 +219,11 @@ class LayoutPostprocessor:
|
|||||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._regular_bbox_tuples = [
|
||||||
|
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b) for c in self.regular_clusters
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||||
"""Main processing pipeline."""
|
"""Main processing pipeline."""
|
||||||
self.regular_clusters = self._process_regular_clusters()
|
self.regular_clusters = self._process_regular_clusters()
|
||||||
@ -329,15 +334,28 @@ class LayoutPostprocessor:
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# PRE-PROCESS REGULAR CLUSTERS BY BOUNDS
|
||||||
|
# For each special, pre-filter only regular clusters whose bbox intersects
|
||||||
|
# the special's bbox, using a quick rectangle intersection test.
|
||||||
|
regular_bbox_tuples = self._regular_bbox_tuples # local for speed
|
||||||
for special in special_clusters:
|
for special in special_clusters:
|
||||||
contained = []
|
contained = []
|
||||||
for cluster in self.regular_clusters:
|
s_bbox = special.bbox
|
||||||
|
sl, st, sr, sb = s_bbox.l, s_bbox.t, s_bbox.r, s_bbox.b
|
||||||
|
|
||||||
|
# Find only those regular clusters whose bbox intersects the special's bbox
|
||||||
|
possible = []
|
||||||
|
for cluster, left, top, right, bottom in regular_bbox_tuples:
|
||||||
|
if left < sr and right > sl and top < sb and bottom > st:
|
||||||
|
possible.append(cluster)
|
||||||
|
|
||||||
|
# Now do the expensive computation only for these
|
||||||
|
for cluster in possible:
|
||||||
containment = cluster.bbox.intersection_over_self(special.bbox)
|
containment = cluster.bbox.intersection_over_self(special.bbox)
|
||||||
if containment > 0.8:
|
if containment > 0.8:
|
||||||
contained.append(cluster)
|
contained.append(cluster)
|
||||||
|
|
||||||
if contained:
|
if contained:
|
||||||
# Sort contained clusters by minimum cell ID:
|
|
||||||
contained = self._sort_clusters(contained, mode="id")
|
contained = self._sort_clusters(contained, mode="id")
|
||||||
special.children = contained
|
special.children = contained
|
||||||
|
|
||||||
@ -380,24 +398,36 @@ class LayoutPostprocessor:
|
|||||||
should be removed.
|
should be removed.
|
||||||
"""
|
"""
|
||||||
wrappers_to_remove = set()
|
wrappers_to_remove = set()
|
||||||
|
# Precompute table clusters with their bbox coordinates for intersection testing
|
||||||
|
table_clusters = [
|
||||||
|
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b)
|
||||||
|
for c in self.regular_clusters
|
||||||
|
if c.label == DocItemLabel.TABLE
|
||||||
|
]
|
||||||
|
|
||||||
for wrapper in special_clusters:
|
for wrapper in special_clusters:
|
||||||
if wrapper.label not in self.WRAPPER_TYPES:
|
if wrapper.label not in self.WRAPPER_TYPES:
|
||||||
continue # only treat KEY_VALUE_REGION for now.
|
continue # only treat KEY_VALUE_REGION for now.
|
||||||
|
|
||||||
for regular in self.regular_clusters:
|
wbb = wrapper.bbox
|
||||||
if regular.label == DocItemLabel.TABLE:
|
wl, wt, wr, wb = wbb.l, wbb.t, wbb.r, wbb.b
|
||||||
# Calculate overlap
|
|
||||||
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
|
||||||
|
|
||||||
conf_diff = wrapper.confidence - regular.confidence
|
# restrict to table-regulars whose bbox intersects
|
||||||
|
possible = []
|
||||||
|
for c, left, top, right, bottom in table_clusters:
|
||||||
|
if left < wr and right > wl and top < wb and bottom > wt:
|
||||||
|
possible.append(c)
|
||||||
|
|
||||||
# If wrapper is mostly overlapping with a TABLE, remove the wrapper
|
for regular in possible:
|
||||||
if (
|
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
||||||
overlap_ratio > 0.9 and conf_diff < 0.1
|
conf_diff = wrapper.confidence - regular.confidence
|
||||||
): # self.OVERLAP_PARAMS["wrapper"]["conf_threshold"]): # 80% overlap threshold
|
|
||||||
wrappers_to_remove.add(wrapper.id)
|
# If wrapper is mostly overlapping with a TABLE, remove the wrapper
|
||||||
break
|
if (
|
||||||
|
overlap_ratio > 0.9 and conf_diff < 0.1
|
||||||
|
): # self.OVERLAP_PARAMS["wrapper"]["conf_threshold"]): # 80% overlap threshold
|
||||||
|
wrappers_to_remove.add(wrapper.id)
|
||||||
|
break
|
||||||
|
|
||||||
# Filter out the identified wrappers
|
# Filter out the identified wrappers
|
||||||
special_clusters = [
|
special_clusters = [
|
||||||
|
Loading…
Reference in New Issue
Block a user