Annoying fixes for historical Python versions

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-17 16:31:59 +01:00
parent d29a245b8c
commit 6d38c7cc75
2 changed files with 30 additions and 15 deletions

View File

@ -168,7 +168,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
except Exception as e: except Exception as e:
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e)) trace = "\n".join(
traceback.format_exception(type(e), e, e.__traceback__)
)
_log.warning( _log.warning(
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n" f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
f"{trace}" f"{trace}"

View File

@ -79,7 +79,7 @@ class SpatialClusterIndex:
y_candidates = self.y_intervals.find_containing( y_candidates = self.y_intervals.find_containing(
bbox.t bbox.t
) | self.y_intervals.find_containing(bbox.b) ) | self.y_intervals.find_containing(bbox.b)
return spatial | x_candidates | y_candidates return spatial.union(x_candidates).union(y_candidates)
def check_overlap( def check_overlap(
self, self,
@ -108,34 +108,47 @@ class SpatialClusterIndex:
) )
class Interval:
    """Helper class for sortable intervals.

    Instances are ordered by ``min_val`` alone and may also be compared
    against a bare number. Only ``<`` is implemented, which is the single
    comparison the ``bisect`` helpers rely on when the interval is the
    left-hand operand (or when both operands are intervals).
    """

    def __init__(self, min_val: float, max_val: float, id: int):
        """Store the interval bounds and the identifier attached to them.

        Args:
            min_val: Lower bound; the only value used for ordering.
            max_val: Upper bound; not consulted by comparisons.
            id: Identifier carried along with the interval.
        """
        self.min_val = min_val
        self.max_val = max_val
        self.id = id

    def __lt__(self, other):
        """Return True if this interval starts before ``other``.

        ``other`` may be another ``Interval`` (its ``min_val`` is used) or
        any value directly comparable with ``min_val``, such as a query
        point.
        """
        if isinstance(other, Interval):
            return self.min_val < other.min_val
        # Non-interval operand: compare the lower bound against it directly.
        return self.min_val < other

    def __repr__(self) -> str:
        """Return a debugging representation listing all three fields."""
        return (
            f"{type(self).__name__}(min_val={self.min_val!r}, "
            f"max_val={self.max_val!r}, id={self.id!r})"
        )
class IntervalTree: class IntervalTree:
"""Memory-efficient interval tree for 1D overlap queries.""" """Memory-efficient interval tree for 1D overlap queries."""
def __init__(self): def __init__(self):
self.intervals: List[Tuple[float, float, int]] = ( self.intervals: List[Interval] = [] # Sorted by min_val
[]
) # (min, max, id) sorted by min
def insert(self, min_val: float, max_val: float, id: int): def insert(self, min_val: float, max_val: float, id: int):
bisect.insort(self.intervals, (min_val, max_val, id), key=lambda x: x[0]) interval = Interval(min_val, max_val, id)
bisect.insort(self.intervals, interval)
def find_containing(self, point: float) -> Set[int]: def find_containing(self, point: float) -> Set[int]:
"""Find all intervals containing the point.""" """Find all intervals containing the point."""
pos = bisect.bisect_left(self.intervals, (point, float("-inf"), -1)) pos = bisect.bisect_left(self.intervals, point)
result = set() result = set()
# Check intervals starting before point # Check intervals starting before point
for min_val, max_val, id in reversed(self.intervals[:pos]): for interval in reversed(self.intervals[:pos]):
if min_val <= point <= max_val: if interval.min_val <= point <= interval.max_val:
result.add(id) result.add(interval.id)
else: else:
break break
# Check intervals starting at/after point # Check intervals starting at/after point
for min_val, max_val, id in self.intervals[pos:]: for interval in self.intervals[pos:]:
if point <= max_val: if point <= interval.max_val:
if min_val <= point: if interval.min_val <= point:
result.add(id) result.add(interval.id)
else: else:
break break
@ -158,7 +171,7 @@ class LayoutPostprocessor:
DocItemLabel.TABLE, DocItemLabel.TABLE,
DocItemLabel.DOCUMENT_INDEX, DocItemLabel.DOCUMENT_INDEX,
} }
SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE} SPECIAL_TYPES = WRAPPER_TYPES.union({DocItemLabel.PICTURE})
CONFIDENCE_THRESHOLDS = { CONFIDENCE_THRESHOLDS = {
DocItemLabel.CAPTION: 0.5, DocItemLabel.CAPTION: 0.5,