mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
clean up
This commit is contained in:
parent
3b8deae9ce
commit
bd8b1c42d4
@ -194,7 +194,6 @@ class LayoutPostprocessor:
|
|||||||
# DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
# DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
||||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||||
}
|
}
|
||||||
# All constants, class attributes here as before.
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
||||||
@ -220,18 +219,10 @@ class LayoutPostprocessor:
|
|||||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||||
)
|
)
|
||||||
|
|
||||||
# ---- NEW OPTIMIZED: Precompute quick-access table for regular cluster bboxes ----
|
|
||||||
# (For hot-path bbox filtering inside _process_special_clusters/_handle_cross_type_overlaps)
|
|
||||||
# Cluster ID mapped to (cluster, bbox, (l,t,r,b)) for fast lookup.
|
|
||||||
self._regular_bbox_tuples = [
|
self._regular_bbox_tuples = [
|
||||||
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b) for c in self.regular_clusters
|
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b) for c in self.regular_clusters
|
||||||
]
|
]
|
||||||
self._regular_bboxes = [
|
|
||||||
(c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b) for c in self.regular_clusters
|
|
||||||
]
|
|
||||||
self._regular_clusters_list = (
|
|
||||||
self.regular_clusters
|
|
||||||
) # For index access with above
|
|
||||||
|
|
||||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||||
"""Main processing pipeline."""
|
"""Main processing pipeline."""
|
||||||
@ -323,6 +314,7 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
special_clusters = self._handle_cross_type_overlaps(special_clusters)
|
special_clusters = self._handle_cross_type_overlaps(special_clusters)
|
||||||
|
|
||||||
|
# Calculate page area from known page size
|
||||||
assert self.page_size is not None
|
assert self.page_size is not None
|
||||||
page_area = self.page_size.width * self.page_size.height
|
page_area = self.page_size.width * self.page_size.height
|
||||||
if page_area > 0:
|
if page_area > 0:
|
||||||
@ -336,7 +328,7 @@ class LayoutPostprocessor:
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
# ---- OPTIMIZED: PRE-PROCESS REGULAR CLUSTERS BY BOUNDS ----
|
# PRE-PROCESS REGULAR CLUSTERS BY BOUNDS
|
||||||
# For each special, pre-filter only regular clusters whose bbox intersects
|
# For each special, pre-filter only regular clusters whose bbox intersects
|
||||||
# the special's bbox, using a quick rectangle intersection test.
|
# the special's bbox, using a quick rectangle intersection test.
|
||||||
regular_bbox_tuples = self._regular_bbox_tuples # local for speed
|
regular_bbox_tuples = self._regular_bbox_tuples # local for speed
|
||||||
@ -347,23 +339,22 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
# Find only those regular clusters whose bbox intersects the special's bbox
|
# Find only those regular clusters whose bbox intersects the special's bbox
|
||||||
possible = []
|
possible = []
|
||||||
for c, l, t, r, b in regular_bbox_tuples:
|
for c, left, top, right, bottom in regular_bbox_tuples:
|
||||||
if l < sr and r > sl and t < sb and b > st:
|
if left < sr and right > sl and top < sb and bottom > st:
|
||||||
possible.append((c, l, t, r, b))
|
possible.append(c)
|
||||||
|
|
||||||
# Now do the expensive computation only for these
|
# Now do the expensive computation only for these
|
||||||
append_contained = contained.append
|
for c in possible:
|
||||||
for c, _, _, _, _ in possible:
|
|
||||||
containment = c.bbox.intersection_over_self(special.bbox)
|
containment = c.bbox.intersection_over_self(special.bbox)
|
||||||
if containment > 0.8:
|
if containment > 0.8:
|
||||||
append_contained(c)
|
contained.append(c)
|
||||||
|
|
||||||
if contained:
|
if contained:
|
||||||
contained = self._sort_clusters(contained, mode="id")
|
contained = self._sort_clusters(contained, mode="id")
|
||||||
special.children = contained
|
special.children = contained
|
||||||
|
|
||||||
|
# Adjust bbox only for Form and Key-Value-Region, not Table or Picture
|
||||||
if special.label in [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]:
|
if special.label in [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]:
|
||||||
# This is still cheap, keeps the minimal code change
|
|
||||||
special.bbox = BoundingBox(
|
special.bbox = BoundingBox(
|
||||||
l=min(c.bbox.l for c in contained),
|
l=min(c.bbox.l for c in contained),
|
||||||
t=min(c.bbox.t for c in contained),
|
t=min(c.bbox.t for c in contained),
|
||||||
@ -401,7 +392,7 @@ class LayoutPostprocessor:
|
|||||||
should be removed.
|
should be removed.
|
||||||
"""
|
"""
|
||||||
wrappers_to_remove = set()
|
wrappers_to_remove = set()
|
||||||
# OPTIMIZED: Pre-index tables only for bbox fast filter
|
# Precompute table clusters with their bbox coordinates for intersection testing
|
||||||
table_clusters = [
|
table_clusters = [
|
||||||
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b)
|
(c, c.bbox.l, c.bbox.t, c.bbox.r, c.bbox.b)
|
||||||
for c in self.regular_clusters
|
for c in self.regular_clusters
|
||||||
@ -417,8 +408,8 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
# restrict to table-regulars whose bbox intersects
|
# restrict to table-regulars whose bbox intersects
|
||||||
possible = []
|
possible = []
|
||||||
for c, l, t, r, b in table_clusters:
|
for c, left, top, right, bottom in table_clusters:
|
||||||
if l < wr and r > wl and t < wb and b > wt:
|
if left < wr and right > wl and top < wb and bottom > wt:
|
||||||
possible.append(c)
|
possible.append(c)
|
||||||
|
|
||||||
for regular in possible:
|
for regular in possible:
|
||||||
@ -523,7 +514,9 @@ class LayoutPostprocessor:
|
|||||||
spatial_index = (
|
spatial_index = (
|
||||||
self.regular_index
|
self.regular_index
|
||||||
if cluster_type == "regular"
|
if cluster_type == "regular"
|
||||||
else self.picture_index if cluster_type == "picture" else self.wrapper_index
|
else self.picture_index
|
||||||
|
if cluster_type == "picture"
|
||||||
|
else self.wrapper_index
|
||||||
)
|
)
|
||||||
|
|
||||||
# Map of currently valid clusters
|
# Map of currently valid clusters
|
||||||
|
Loading…
Reference in New Issue
Block a user