mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-03 07:52:20 +00:00
Annoying fixes for historical python versions
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
d29a245b8c
commit
6d38c7cc75
@@ -168,7 +168,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
trace = "\n".join(traceback.format_exception(e))
|
trace = "\n".join(
|
||||||
|
traceback.format_exception(type(e), e, e.__traceback__)
|
||||||
|
)
|
||||||
_log.warning(
|
_log.warning(
|
||||||
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
||||||
f"{trace}"
|
f"{trace}"
|
||||||
|
@@ -79,7 +79,7 @@ class SpatialClusterIndex:
|
|||||||
y_candidates = self.y_intervals.find_containing(
|
y_candidates = self.y_intervals.find_containing(
|
||||||
bbox.t
|
bbox.t
|
||||||
) | self.y_intervals.find_containing(bbox.b)
|
) | self.y_intervals.find_containing(bbox.b)
|
||||||
return spatial | x_candidates | y_candidates
|
return spatial.union(x_candidates).union(y_candidates)
|
||||||
|
|
||||||
def check_overlap(
|
def check_overlap(
|
||||||
self,
|
self,
|
||||||
@@ -108,34 +108,47 @@ class SpatialClusterIndex:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Interval:
|
||||||
|
"""Helper class for sortable intervals."""
|
||||||
|
|
||||||
|
def __init__(self, min_val: float, max_val: float, id: int):
|
||||||
|
self.min_val = min_val
|
||||||
|
self.max_val = max_val
|
||||||
|
self.id = id
|
||||||
|
|
||||||
|
def __lt__(self, other):
|
||||||
|
if isinstance(other, Interval):
|
||||||
|
return self.min_val < other.min_val
|
||||||
|
return self.min_val < other
|
||||||
|
|
||||||
|
|
||||||
class IntervalTree:
|
class IntervalTree:
|
||||||
"""Memory-efficient interval tree for 1D overlap queries."""
|
"""Memory-efficient interval tree for 1D overlap queries."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.intervals: List[Tuple[float, float, int]] = (
|
self.intervals: List[Interval] = [] # Sorted by min_val
|
||||||
[]
|
|
||||||
) # (min, max, id) sorted by min
|
|
||||||
|
|
||||||
def insert(self, min_val: float, max_val: float, id: int):
|
def insert(self, min_val: float, max_val: float, id: int):
|
||||||
bisect.insort(self.intervals, (min_val, max_val, id), key=lambda x: x[0])
|
interval = Interval(min_val, max_val, id)
|
||||||
|
bisect.insort(self.intervals, interval)
|
||||||
|
|
||||||
def find_containing(self, point: float) -> Set[int]:
|
def find_containing(self, point: float) -> Set[int]:
|
||||||
"""Find all intervals containing the point."""
|
"""Find all intervals containing the point."""
|
||||||
pos = bisect.bisect_left(self.intervals, (point, float("-inf"), -1))
|
pos = bisect.bisect_left(self.intervals, point)
|
||||||
result = set()
|
result = set()
|
||||||
|
|
||||||
# Check intervals starting before point
|
# Check intervals starting before point
|
||||||
for min_val, max_val, id in reversed(self.intervals[:pos]):
|
for interval in reversed(self.intervals[:pos]):
|
||||||
if min_val <= point <= max_val:
|
if interval.min_val <= point <= interval.max_val:
|
||||||
result.add(id)
|
result.add(interval.id)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Check intervals starting at/after point
|
# Check intervals starting at/after point
|
||||||
for min_val, max_val, id in self.intervals[pos:]:
|
for interval in self.intervals[pos:]:
|
||||||
if point <= max_val:
|
if point <= interval.max_val:
|
||||||
if min_val <= point:
|
if interval.min_val <= point:
|
||||||
result.add(id)
|
result.add(interval.id)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -158,7 +171,7 @@ class LayoutPostprocessor:
|
|||||||
DocItemLabel.TABLE,
|
DocItemLabel.TABLE,
|
||||||
DocItemLabel.DOCUMENT_INDEX,
|
DocItemLabel.DOCUMENT_INDEX,
|
||||||
}
|
}
|
||||||
SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE}
|
SPECIAL_TYPES = WRAPPER_TYPES.union({DocItemLabel.PICTURE})
|
||||||
|
|
||||||
CONFIDENCE_THRESHOLDS = {
|
CONFIDENCE_THRESHOLDS = {
|
||||||
DocItemLabel.CAPTION: 0.5,
|
DocItemLabel.CAPTION: 0.5,
|
||||||
|
Loading…
Reference in New Issue
Block a user