diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index dc6d2693..bb1fe058 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin, Size -from docling_parse.docling_parse import pdf_parser_v1 +from docling_parse.pdf_parsers import pdf_parser_v1 from PIL import Image, ImageDraw from pypdfium2 import PdfPage diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index b518850e..93f33466 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_parse.docling_parse import pdf_parser_v2 +from docling_parse.pdf_parsers import pdf_parser_v2 from PIL import Image, ImageDraw from pypdfium2 import PdfPage diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 55a19ac3..0bc31b16 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -121,6 +121,7 @@ class Cluster(BaseModel): bbox: BoundingBox confidence: float = 1.0 cells: List[Cell] = [] + children: List["Cluster"] = [] # Add child cluster support class BasePageElement(BaseModel): @@ -135,6 +136,12 @@ class LayoutPrediction(BaseModel): clusters: List[Cluster] = [] +class ContainerElement( + BasePageElement +): # Used for Form and Key-Value-Regions, only for typing. 
+ pass + + class Table(BasePageElement): otsl_seq: List[str] num_rows: int = 0 @@ -174,7 +181,7 @@ class PagePredictions(BaseModel): equations_prediction: Optional[EquationPrediction] = None -PageElement = Union[TextElement, Table, FigureElement] +PageElement = Union[TextElement, Table, FigureElement, ContainerElement] class AssembledUnit(BaseModel): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index e5b49343..242052f7 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -77,6 +77,8 @@ layout_label_to_ds_type = { DocItemLabel.PICTURE: "figure", DocItemLabel.TEXT: "paragraph", DocItemLabel.PARAGRAPH: "paragraph", + DocItemLabel.FORM: DocItemLabel.FORM.value, + DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value, } _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy") diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index b1c47305..46bab75c 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -31,6 +31,7 @@ class DebugSettings(BaseModel): visualize_cells: bool = False visualize_ocr: bool = False visualize_layout: bool = False + visualize_raw_layout: bool = False visualize_tables: bool = False profile_pipeline_timings: bool = False diff --git a/docling/document_converter.py b/docling/document_converter.py index 503a4c5b..4637159f 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend 
import MsExcelDocumentBackend @@ -84,7 +85,7 @@ class HTMLFormatOption(FormatOption): class PdfFormatOption(FormatOption): pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend + backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend class ImageFormatOption(FormatOption): diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 0a066bfa..5a42feac 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import List, Union from deepsearch_glm.nlp_utils import init_nlp_model -from deepsearch_glm.utils.doc_utils import to_docling_document from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox @@ -24,11 +23,18 @@ from docling_core.types.legacy_doc.document import ( from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from PIL import ImageDraw -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, TypeAdapter -from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement +from docling.datamodel.base_models import ( + Cluster, + ContainerElement, + FigureElement, + Table, + TextElement, +) from docling.datamodel.document import ConversionResult, layout_label_to_ds_type from docling.datamodel.settings import settings +from docling.utils.glm_utils import to_docling_document from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import create_hash @@ -45,7 +51,9 @@ class GlmModel: if self.options.model_names != "": load_pretrained_nlp_models() - self.model = init_nlp_model(model_names=self.options.model_names) + 
self.model = init_nlp_model( + model_names=self.options.model_names, loglevel="ERROR" + ) def _to_legacy_document(self, conv_res) -> DsDocument: title = "" @@ -207,7 +215,31 @@ class GlmModel: ) ], obj_type=layout_label_to_ds_type.get(element.label), - # data=[[]], + payload={ + "children": TypeAdapter(List[Cluster]).dump_python( + element.cluster.children + ) + }, # hack to channel child clusters through GLM + ) + ) + elif isinstance(element, ContainerElement): + main_text.append( + BaseText( + text="", + payload={ + "children": TypeAdapter(List[Cluster]).dump_python( + element.cluster.children + ) + }, # hack to channel child clusters through GLM + obj_type=layout_label_to_ds_type.get(element.label), + name=element.label, + prov=[ + Prov( + bbox=target_bbox, + page=element.page_no + 1, + span=[0, 0], + ) + ], ) ) @@ -232,7 +264,7 @@ class GlmModel: def __call__(self, conv_res: ConversionResult) -> DoclingDocument: with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT): ds_doc = self._to_legacy_document(conv_res) - ds_doc_dict = ds_doc.model_dump(by_alias=True) + ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True) glm_doc = self.model.apply_on_doc(ds_doc_dict) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index cd8009fd..dd4dbee5 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -7,7 +7,7 @@ from typing import Iterable, List from docling_core.types.doc import CoordOrigin, DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor -from PIL import ImageDraw +from PIL import Image, ImageDraw from docling.datamodel.base_models import ( BoundingBox, @@ -21,7 +21,7 @@ from docling.datamodel.pipeline_options import AcceleratorOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.utils import accelerator_utils as au -from docling.utils import layout_utils as lu +from 
docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -47,6 +47,7 @@ class LayoutModel(BasePageModel): TABLE_LABEL = DocItemLabel.TABLE FIGURE_LABEL = DocItemLabel.PICTURE FORMULA_LABEL = DocItemLabel.FORMULA + CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions): device = au.decide_device(accelerator_options.device) @@ -54,230 +55,103 @@ class LayoutModel(BasePageModel): artifacts_path, device, accelerator_options.num_threads ) - def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height): - MIN_INTERSECTION = 0.2 - CLASS_THRESHOLDS = { - DocItemLabel.CAPTION: 0.35, - DocItemLabel.FOOTNOTE: 0.35, - DocItemLabel.FORMULA: 0.35, - DocItemLabel.LIST_ITEM: 0.35, - DocItemLabel.PAGE_FOOTER: 0.35, - DocItemLabel.PAGE_HEADER: 0.35, - DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples. - DocItemLabel.SECTION_HEADER: 0.45, - DocItemLabel.TABLE: 0.35, - DocItemLabel.TEXT: 0.45, - DocItemLabel.TITLE: 0.45, - DocItemLabel.DOCUMENT_INDEX: 0.45, - DocItemLabel.CODE: 0.45, - DocItemLabel.CHECKBOX_SELECTED: 0.45, - DocItemLabel.CHECKBOX_UNSELECTED: 0.45, - DocItemLabel.FORM: 0.45, - DocItemLabel.KEY_VALUE_REGION: 0.45, + def draw_clusters_and_cells_side_by_side( + self, conv_res, page, clusters, mode_prefix: str, show: bool = False + ): + """ + Draws a page image side by side with clusters filtered into two categories: + - Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE. + - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE. 
+ """ + label_to_color = { + DocItemLabel.TEXT: (255, 255, 153), # Light Yellow + DocItemLabel.CAPTION: (255, 204, 153), # Light Orange + DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple + DocItemLabel.FORMULA: (192, 192, 192), # Gray + DocItemLabel.TABLE: (255, 204, 204), # Light Pink + DocItemLabel.PICTURE: (255, 204, 164), # Light Beige + DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red + DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green + DocItemLabel.PAGE_FOOTER: ( + 204, + 255, + 204, + ), # Light Green (same as Page-Header) + DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header) + DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue + DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray + DocItemLabel.CODE: (255, 223, 186), # Peach + DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green + DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink + DocItemLabel.FORM: (200, 255, 255), # Light Cyan + DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange } - CLASS_REMAPPINGS = { - DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE, - DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, + # Filter clusters for left and right images + exclude_labels = { + DocItemLabel.FORM, + DocItemLabel.KEY_VALUE_REGION, + DocItemLabel.PICTURE, } + left_clusters = [c for c in clusters if c.label not in exclude_labels] + right_clusters = [c for c in clusters if c.label in exclude_labels] - _log.debug("================= Start postprocess function ====================") - start_time = time.time() - # Apply Confidence Threshold to cluster predictions - # confidence = self.conf_threshold - clusters_mod = [] + # Create a deep copy of the original image for both sides + left_image = copy.deepcopy(page.image) + right_image = copy.deepcopy(page.image) - for cluster in clusters_in: - confidence = CLASS_THRESHOLDS[cluster.label] - if cluster.confidence >= confidence: - # annotation["created_by"] = "high_conf_pred" + # Function 
to draw clusters on an image + def draw_clusters(image, clusters): + draw = ImageDraw.Draw(image, "RGBA") + for c_tl in clusters: + all_clusters = [c_tl, *c_tl.children] + for c in all_clusters: + cell_color = (0, 0, 0, 40) # Transparent black for cells + for tc in c.cells: + cx0, cy0, cx1, cy1 = tc.bbox.as_tuple() + draw.rectangle( + [(cx0, cy0), (cx1, cy1)], + outline=None, + fill=cell_color, + ) - # Remap class labels where needed. - if cluster.label in CLASS_REMAPPINGS.keys(): - cluster.label = CLASS_REMAPPINGS[cluster.label] - clusters_mod.append(cluster) + x0, y0, x1, y1 = c.bbox.as_tuple() + cluster_fill_color = ( + *list(label_to_color.get(c.label)), # type: ignore + 70, + ) + cluster_outline_color = ( + *list(label_to_color.get(c.label)), # type: ignore + 255, + ) + draw.rectangle( + [(x0, y0), (x1, y1)], + outline=cluster_outline_color, + fill=cluster_fill_color, + ) - # map to dictionary clusters and cells, with bottom left origin - clusters_orig = [ - { - "id": c.id, - "bbox": list( - c.bbox.to_bottom_left_origin(page_height).as_tuple() - ), # TODO - "confidence": c.confidence, - "cell_ids": [], - "type": c.label, - } - for c in clusters_in - ] + # Draw clusters on both images + draw_clusters(left_image, left_clusters) + draw_clusters(right_image, right_clusters) - clusters_out = [ - { - "id": c.id, - "bbox": list( - c.bbox.to_bottom_left_origin(page_height).as_tuple() - ), # TODO - "confidence": c.confidence, - "created_by": "high_conf_pred", - "cell_ids": [], - "type": c.label, - } - for c in clusters_mod - ] + # Combine the images side by side + combined_width = left_image.width * 2 + combined_height = left_image.height + combined_image = Image.new("RGB", (combined_width, combined_height)) + combined_image.paste(left_image, (0, 0)) + combined_image.paste(right_image, (left_image.width, 0)) - del clusters_mod - - raw_cells = [ - { - "id": c.id, - "bbox": list( - c.bbox.to_bottom_left_origin(page_height).as_tuple() - ), # TODO - "text": c.text, - } - 
for c in cells - ] - cell_count = len(raw_cells) - - _log.debug("---- 0. Treat cluster overlaps ------") - clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8) - - _log.debug( - "---- 1. Initially assign cells to clusters based on minimum intersection ------" - ) - ## Check for cells included in or touched by clusters: - clusters_out = lu.assigning_cell_ids_to_clusters( - clusters_out, raw_cells, MIN_INTERSECTION - ) - - _log.debug("---- 2. Assign Orphans with Low Confidence Detections") - # Creates a map of cell_id->cluster_id - ( - clusters_around_cells, - orphan_cell_indices, - ambiguous_cell_indices, - ) = lu.cell_id_state_map(clusters_out, cell_count) - - # Assign orphan cells with lower confidence predictions - clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred( - clusters_out, clusters_orig, raw_cells, orphan_cell_indices - ) - - # Refresh the cell_ids assignment, after creating new clusters using low conf predictions - clusters_out = lu.assigning_cell_ids_to_clusters( - clusters_out, raw_cells, MIN_INTERSECTION - ) - - _log.debug("---- 3. Settle Ambigous Cells") - # Creates an update map after assignment of cell_id->cluster_id - ( - clusters_around_cells, - orphan_cell_indices, - ambiguous_cell_indices, - ) = lu.cell_id_state_map(clusters_out, cell_count) - - # Settle pdf cells that belong to multiple clusters - clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf( - clusters_out, raw_cells, ambiguous_cell_indices - ) - - _log.debug("---- 4. Set Orphans as Text") - ( - clusters_around_cells, - orphan_cell_indices, - ambiguous_cell_indices, - ) = lu.cell_id_state_map(clusters_out, cell_count) - - clusters_out, orphan_cell_indices = lu.set_orphan_as_text( - clusters_out, clusters_orig, raw_cells, orphan_cell_indices - ) - - _log.debug("---- 5. 
Merge Cells & and adapt the bounding boxes") - # Merge cells orphan cells - clusters_out = lu.merge_cells(clusters_out) - - # Clean up clusters that remain from merged and unreasonable clusters - clusters_out = lu.clean_up_clusters( - clusters_out, - raw_cells, - merge_cells=True, - img_table=True, - one_cell_table=True, - ) - - new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices) - clusters_out = new_clusters - - ## We first rebuild where every cell is now: - ## Now we write into a prediction cells list, not into the raw cells list. - ## As we don't need previous labels, we best overwrite any old list, because that might - ## have been sorted differently. - ( - clusters_around_cells, - orphan_cell_indices, - ambiguous_cell_indices, - ) = lu.cell_id_state_map(clusters_out, cell_count) - - target_cells = [] - for ix, cell in enumerate(raw_cells): - new_cell = { - "id": ix, - "rawcell_id": ix, - "label": "None", - "bbox": cell["bbox"], - "text": cell["text"], - } - for cluster_index in clusters_around_cells[ - ix - ]: # By previous analysis, this is always 1 cluster. - new_cell["label"] = clusters_out[cluster_index]["type"] - target_cells.append(new_cell) - # _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"])) - cells_out = target_cells - - ## ------------------------------- - ## Sort clusters into reasonable reading order, and sort the cells inside each cluster - _log.debug("---- 5. 
Sort clusters in reading order ------") - sorted_clusters = lu.produce_reading_order( - clusters_out, "raw_cell_ids", "raw_cell_ids", True - ) - clusters_out = sorted_clusters - - # end_time = timer() - _log.debug("---- End of postprocessing function ------") - end_time = time.time() - start_time - _log.debug(f"Finished post processing in seconds={end_time:.3f}") - - cells_out_new = [ - Cell( - id=c["id"], # type: ignore - bbox=BoundingBox.from_tuple( - coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore - ).to_top_left_origin(page_height), - text=c["text"], # type: ignore + if show: + combined_image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" ) - for c in cells_out - ] + out_path.mkdir(parents=True, exist_ok=True) - del cells_out - - clusters_out_new = [] - for c in clusters_out: - cluster_cells = [ - ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore - ] - c_new = Cluster( - id=c["id"], # type: ignore - bbox=BoundingBox.from_tuple( - coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore - ).to_top_left_origin(page_height), - confidence=c["confidence"], # type: ignore - label=DocItemLabel(c["type"]), - cells=cluster_cells, - ) - clusters_out_new.append(c_new) - - return clusters_out_new, cells_out_new + out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png" + combined_image.save(str(out_file), format="png") def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] @@ -310,43 +184,78 @@ class LayoutModel(BasePageModel): ) clusters.append(cluster) - # Map cells to clusters - # TODO: Remove, postprocess should take care of it anyway. 
- for cell in page.cells: - for cluster in clusters: - if not cell.bbox.area() > 0: - overlap_frac = 0.0 - else: - overlap_frac = ( - cell.bbox.intersection_area_with(cluster.bbox) - / cell.bbox.area() - ) - - if overlap_frac > 0.5: - cluster.cells.append(cell) - - # Pre-sort clusters - # clusters = self.sort_clusters_by_cell_order(clusters) - # DEBUG code: - def draw_clusters_and_cells(show: bool = False): + def draw_clusters_and_cells( + clusters, mode_prefix: str, show: bool = False + ): + label_to_color = { + DocItemLabel.TEXT: (255, 255, 153), # Light Yellow + DocItemLabel.CAPTION: (255, 204, 153), # Light Orange + DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple + DocItemLabel.FORMULA: (192, 192, 192), # Gray + DocItemLabel.TABLE: (255, 204, 204), # Light Pink + DocItemLabel.PICTURE: (255, 255, 204), # Light Beige + DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red + DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green + DocItemLabel.PAGE_FOOTER: ( + 204, + 255, + 204, + ), # Light Green (same as Page-Header) + DocItemLabel.TITLE: ( + 255, + 153, + 153, + ), # Light Red (same as Section-Header) + DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue + DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray + DocItemLabel.CODE: (255, 223, 186), # Peach + DocItemLabel.CHECKBOX_SELECTED: ( + 255, + 182, + 193, + ), # Pale Green + DocItemLabel.CHECKBOX_UNSELECTED: ( + 255, + 182, + 193, + ), # Light Pink + DocItemLabel.FORM: (200, 255, 255), # Light Cyan + DocItemLabel.KEY_VALUE_REGION: ( + 183, + 65, + 14, + ), # Rusty orange + } + image = copy.deepcopy(page.image) if image is not None: - draw = ImageDraw.Draw(image) + draw = ImageDraw.Draw(image, "RGBA") for c in clusters: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="green") - - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), - ) + cell_color = (0, 0, 0, 40) for tc in c.cells: # [:1]: - x0, y0, x1, y1 = 
tc.bbox.as_tuple() + cx0, cy0, cx1, cy1 = tc.bbox.as_tuple() draw.rectangle( - [(x0, y0), (x1, y1)], outline=cell_color + [(cx0, cy0), (cx1, cy1)], + outline=None, + fill=cell_color, ) + + x0, y0, x1, y1 = c.bbox.as_tuple() + cluster_fill_color = ( + *list(label_to_color.get(c.label)), # type: ignore + 70, + ) + cluster_outline_color = ( + *list(label_to_color.get(c.label)), # type: ignore + 255, + ) + draw.rectangle( + [(x0, y0), (x1, y1)], + outline=cluster_outline_color, + fill=cluster_fill_color, + ) + if show: image.show() else: @@ -357,19 +266,30 @@ class LayoutModel(BasePageModel): out_path.mkdir(parents=True, exist_ok=True) out_file = ( - out_path / f"layout_page_{page.page_no:05}.png" + out_path + / f"{mode_prefix}_layout_page_{page.page_no:05}.png" ) image.save(str(out_file), format="png") - # draw_clusters_and_cells() + if settings.debug.visualize_raw_layout: + self.draw_clusters_and_cells_side_by_side( + conv_res, page, clusters, mode_prefix="raw" + ) - clusters, page.cells = self.postprocess( - clusters, page.cells, page.size.height + # Apply postprocessing + processed_clusters, processed_cells = LayoutPostprocessor( + page.cells, clusters + ).postprocess() + # processed_clusters, processed_cells = clusters, page.cells + + page.cells = processed_cells + page.predictions.layout = LayoutPrediction( + clusters=processed_clusters ) - page.predictions.layout = LayoutPrediction(clusters=clusters) - if settings.debug.visualize_layout: - draw_clusters_and_cells() + self.draw_clusters_and_cells_side_by_side( + conv_res, page, processed_clusters, mode_prefix="postprocessed" + ) yield page diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 9b064ead..4c27400f 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -6,6 +6,7 @@ from pydantic import BaseModel from docling.datamodel.base_models import ( AssembledUnit, + ContainerElement, FigureElement, Page, PageElement, @@ -159,6 
+160,15 @@ class PageAssembleModel(BasePageModel): ) elements.append(equation) body.append(equation) + elif cluster.label in LayoutModel.CONTAINER_LABELS: + container_el = ContainerElement( + label=cluster.label, + id=cluster.id, + page_no=page.page_no, + cluster=cluster, + ) + elements.append(container_el) + body.append(container_el) page.assembled = AssembledUnit( elements=elements, headers=headers, body=body diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 2fcb53a5..d9a48a49 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -40,7 +40,8 @@ _log = logging.getLogger(__name__) class StandardPdfPipeline(PaginatedPipeline): # TODO: Revise after having the models in HF # _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt" - _layout_model_path = "model_artifacts/layout/" + + _layout_model_path = "model_artifacts/layout" _table_model_path = "model_artifacts/tableformer" def __init__(self, pipeline_options: PdfPipelineOptions): diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py new file mode 100644 index 00000000..13681017 --- /dev/null +++ b/docling/utils/glm_utils.py @@ -0,0 +1,336 @@ +import re +from pathlib import Path +from typing import List + +import pandas as pd +from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + GroupLabel, + ProvenanceItem, + Size, + TableCell, + TableData, +) + + +def resolve_item(paths, obj): + """Find item in document from a reference path""" + + if len(paths) == 0: + return obj + + if paths[0] == "#": + return resolve_item(paths[1:], obj) + + try: + key = int(paths[0]) + except: + key = paths[0] + + if len(paths) == 1: + if isinstance(key, str) and key in obj: + return obj[key] + elif isinstance(key, int) and key < len(obj): + return obj[key] + else: + return None + + elif len(paths) > 1: + if isinstance(key, str) and key in obj: + 
return resolve_item(paths[1:], obj[key]) + elif isinstance(key, int) and key < len(obj): + return resolve_item(paths[1:], obj[key]) + else: + return None + + else: + return None + + +def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]: + unique_objects = [] + seen_spans = set() + + for sublist in grid: + for obj in sublist: + # Convert the spans list to a tuple of tuples for hashing + spans_tuple = tuple(tuple(span) for span in obj["spans"]) + if spans_tuple not in seen_spans: + seen_spans.add(spans_tuple) + unique_objects.append(obj) + + return unique_objects + + +def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: + origin = DocumentOrigin( + mimetype="application/pdf", + filename=doc_glm["file-info"]["filename"], + binary_hash=doc_glm["file-info"]["document-hash"], + ) + doc_name = Path(origin.filename).stem + + doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin) + + for page_dim in doc_glm["page-dimensions"]: + page_no = int(page_dim["page"]) + size = Size(width=page_dim["width"], height=page_dim["height"]) + + doc.add_page(page_no=page_no, size=size) + + if "properties" in doc_glm: + props = pd.DataFrame( + doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"] + ) + else: + props = pd.DataFrame() + + current_list = None + + for ix, pelem in enumerate(doc_glm["page-elements"]): + ptype = pelem["type"] + span_i = pelem["span"][0] + span_j = pelem["span"][1] + + if "iref" not in pelem: + # print(json.dumps(pelem, indent=2)) + continue + + iref = pelem["iref"] + + if re.match("#/figures/(\\d+)/captions/(.+)", iref): + # print(f"skip {iref}") + continue + + if re.match("#/tables/(\\d+)/captions/(.+)", iref): + # print(f"skip {iref}") + continue + + path = iref.split("/") + obj = resolve_item(path, doc_glm) + + if obj is None: + current_list = None + print(f"warning: undefined {path}") + continue + + if ptype == "figure": + current_list = None + text = "" + caption_refs = [] + for caption in 
obj["captions"]: + text += caption["text"] + + for nprov in caption["prov"]: + npaths = nprov["$ref"].split("/") + nelem = resolve_item(npaths, doc_glm) + + if nelem is None: + # print(f"warning: undefined caption {npaths}") + continue + + span_i = nelem["span"][0] + span_j = nelem["span"][1] + + cap_text = caption["text"][span_i:span_j] + + # doc_glm["page-elements"].remove(nelem) + + prov = ProvenanceItem( + page_no=nelem["page"], + charspan=tuple(nelem["span"]), + bbox=BoundingBox.from_tuple( + nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT + ), + ) + + caption_obj = doc.add_text( + label=DocItemLabel.CAPTION, text=cap_text, prov=prov + ) + caption_refs.append(caption_obj.get_ref()) + + prov = ProvenanceItem( + page_no=pelem["page"], + charspan=(0, len(text)), + bbox=BoundingBox.from_tuple( + pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT + ), + ) + + pic = doc.add_picture(prov=prov) + pic.captions.extend(caption_refs) + _add_child_elements(pic, doc, obj, pelem) + + elif ptype == "table": + current_list = None + text = "" + caption_refs = [] + for caption in obj["captions"]: + text += caption["text"] + + for nprov in caption["prov"]: + npaths = nprov["$ref"].split("/") + nelem = resolve_item(npaths, doc_glm) + + if nelem is None: + # print(f"warning: undefined caption {npaths}") + continue + + span_i = nelem["span"][0] + span_j = nelem["span"][1] + + cap_text = caption["text"][span_i:span_j] + + # doc_glm["page-elements"].remove(nelem) + + prov = ProvenanceItem( + page_no=nelem["page"], + charspan=tuple(nelem["span"]), + bbox=BoundingBox.from_tuple( + nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT + ), + ) + + caption_obj = doc.add_text( + label=DocItemLabel.CAPTION, text=cap_text, prov=prov + ) + caption_refs.append(caption_obj.get_ref()) + + table_cells_glm = _flatten_table_grid(obj["data"]) + + table_cells = [] + for tbl_cell_glm in table_cells_glm: + if tbl_cell_glm["bbox"] is not None: + bbox = BoundingBox.from_tuple( + tbl_cell_glm["bbox"], 
origin=CoordOrigin.BOTTOMLEFT + ) + else: + bbox = None + + is_col_header = False + is_row_header = False + is_row_section = False + + if tbl_cell_glm["type"] == "col_header": + is_col_header = True + elif tbl_cell_glm["type"] == "row_header": + is_row_header = True + elif tbl_cell_glm["type"] == "row_section": + is_row_section = True + + table_cells.append( + TableCell( + row_span=tbl_cell_glm["row-span"][1] + - tbl_cell_glm["row-span"][0], + col_span=tbl_cell_glm["col-span"][1] + - tbl_cell_glm["col-span"][0], + start_row_offset_idx=tbl_cell_glm["row-span"][0], + end_row_offset_idx=tbl_cell_glm["row-span"][1], + start_col_offset_idx=tbl_cell_glm["col-span"][0], + end_col_offset_idx=tbl_cell_glm["col-span"][1], + text=tbl_cell_glm["text"], + bbox=bbox, + column_header=is_col_header, + row_header=is_row_header, + row_section=is_row_section, + ) + ) + + tbl_data = TableData( + num_rows=obj.get("#-rows", 0), + num_cols=obj.get("#-cols", 0), + table_cells=table_cells, + ) + + prov = ProvenanceItem( + page_no=pelem["page"], + charspan=(0, 0), + bbox=BoundingBox.from_tuple( + pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT + ), + ) + + tbl = doc.add_table(data=tbl_data, prov=prov) + tbl.captions.extend(caption_refs) + + elif ptype in ["form", "key_value_region"]: + label = DocItemLabel(ptype) + container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label) + + _add_child_elements(container_el, doc, obj, pelem) + + elif "text" in obj: + text = obj["text"][span_i:span_j] + + type_label = pelem["type"] + name_label = pelem["name"] + if update_name_label and len(props) > 0 and type_label == "paragraph": + prop = props[ + (props["type"] == "semantic") & (props["subj_path"] == iref) + ] + if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85: + name_label = prop.iloc[0]["label"] + + prov = ProvenanceItem( + page_no=pelem["page"], + charspan=(0, len(text)), + bbox=BoundingBox.from_tuple( + pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT + ), + ) + label = 
DocItemLabel(name_label) + + if label == DocItemLabel.LIST_ITEM: + if current_list is None: + current_list = doc.add_group(label=GroupLabel.LIST, name="list") + + # TODO: Infer if this is a numbered or a bullet list item + doc.add_list_item( + text=text, enumerated=False, prov=prov, parent=current_list + ) + elif label == DocItemLabel.SECTION_HEADER: + current_list = None + + doc.add_heading(text=text, prov=prov) + else: + current_list = None + + doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov) + + return doc + + +def _add_child_elements(container_el, doc, obj, pelem): + payload = obj.get("payload") + if payload is not None: + children = payload.get("children", []) + + for child in children: + c_label = DocItemLabel(child["label"]) + c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin( + doc.pages[pelem["page"]].size.height + ) + c_text = " ".join( + [ + cell["text"].replace("\x02", "-").strip() + for cell in child["cells"] + if len(cell["text"].strip()) > 0 + ] + ) + + c_prov = ProvenanceItem( + page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox + ) + if c_label == DocItemLabel.LIST_ITEM: + # TODO: Infer if this is a numbered or a bullet list item + doc.add_list_item(parent=container_el, text=c_text, prov=c_prov) + elif c_label == DocItemLabel.SECTION_HEADER: + doc.add_heading(parent=container_el, text=c_text, prov=c_prov) + else: + doc.add_text( + parent=container_el, label=c_label, text=c_text, prov=c_prov + ) diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py new file mode 100644 index 00000000..802e7f82 --- /dev/null +++ b/docling/utils/layout_postprocessor.py @@ -0,0 +1,496 @@ +import bisect +import logging +import sys +from collections import defaultdict +from typing import Dict, List, Set, Tuple + +from docling_core.types.doc import DocItemLabel +from rtree import index + +from docling.datamodel.base_models import BoundingBox, Cell, Cluster + +_log = 
class UnionFind:
    """Disjoint-set structure with path compression and union by rank.

    Elements must be hashable; each one starts out in its own group.
    """

    def __init__(self, elements):
        """Create one singleton group per item of ``elements``.

        ``elements`` is consumed exactly once, so a generator or other
        one-shot iterator works as well as a list or a dict view. (Building
        ``rank`` from a second pass over ``elements`` left it empty for
        iterators, and the first ``union`` then raised KeyError.)
        """
        self.parent = {elem: elem for elem in elements}
        self.rank = dict.fromkeys(self.parent, 0)

    def find(self, x):
        """Return the representative of the group containing ``x``.

        Raises:
            KeyError: if ``x`` was not among the initial elements.
        """
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: path compression, point every visited node at the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, x, y):
        """Merge the groups containing ``x`` and ``y`` (no-op if already merged)."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return

        # Union by rank: hang the shallower tree under the deeper one.
        if self.rank[root_x] > self.rank[root_y]:
            self.parent[root_y] = root_x
        elif self.rank[root_x] < self.rank[root_y]:
            self.parent[root_x] = root_y
        else:
            self.parent[root_y] = root_x
            self.rank[root_x] += 1

    def get_groups(self) -> Dict[int, List[int]]:
        """Returns groups as {root: [elements]}."""
        groups = defaultdict(list)
        for elem in self.parent:
            groups[self.find(elem)].append(elem)
        return groups
class IntervalTree:
    """Sorted-list index of closed 1D intervals for point-containment queries.

    Intervals are stored as ``(min, max, id)`` tuples kept in ascending order
    of ``min``.
    """

    def __init__(self):
        self.intervals: List[Tuple[float, float, int]] = (
            []
        )  # (min, max, id) sorted by min

    def insert(self, min_val: float, max_val: float, id: int):
        """Insert the closed interval ``[min_val, max_val]`` tagged with ``id``."""
        # Plain tuple ordering already sorts by `min` first, so no `key=` is
        # needed; `bisect.insort(..., key=...)` only exists on Python 3.10+.
        bisect.insort(self.intervals, (min_val, max_val, id))

    def find_containing(self, point: float) -> Set[int]:
        """Find all intervals containing the point.

        Returns:
            The ids of every stored interval with ``min <= point <= max``.
        """
        result = set()
        for min_val, max_val, interval_id in self.intervals:
            if min_val > point:
                # Sorted by start: no later interval can contain the point.
                break
            # Every interval that starts at or before the point has to be
            # checked. Stopping at the first miss is wrong, because a short
            # interval can sit between the point and a long enclosing one.
            if point <= max_val:
                result.add(interval_id)
        return result
"regular": {"area_threshold": 1.3, "conf_threshold": 0.05}, + "picture": {"area_threshold": 2.0, "conf_threshold": 0.3}, + "wrapper": {"area_threshold": 2.0, "conf_threshold": 0.2}, + } + + WRAPPER_TYPES = {DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION} + SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE} + + CONFIDENCE_THRESHOLDS = { + DocItemLabel.CAPTION: 0.35, + DocItemLabel.FOOTNOTE: 0.35, + DocItemLabel.FORMULA: 0.35, + DocItemLabel.LIST_ITEM: 0.35, + DocItemLabel.PAGE_FOOTER: 0.35, + DocItemLabel.PAGE_HEADER: 0.35, + DocItemLabel.PICTURE: 0.1, + DocItemLabel.SECTION_HEADER: 0.45, + DocItemLabel.TABLE: 0.35, + DocItemLabel.TEXT: 0.45, + DocItemLabel.TITLE: 0.45, + DocItemLabel.CODE: 0.45, + DocItemLabel.CHECKBOX_SELECTED: 0.45, + DocItemLabel.CHECKBOX_UNSELECTED: 0.45, + DocItemLabel.FORM: 0.45, + DocItemLabel.KEY_VALUE_REGION: 0.45, + DocItemLabel.DOCUMENT_INDEX: 0.45, + } + + LABEL_REMAPPING = { + DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE, + DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, + } + + def __init__(self, cells: List[Cell], clusters: List[Cluster]): + """Initialize processor with cells and clusters.""" + """Initialize processor with cells and spatial indices.""" + self.cells = cells + self.regular_clusters = [ + c for c in clusters if c.label not in self.SPECIAL_TYPES + ] + self.special_clusters = [c for c in clusters if c.label in self.SPECIAL_TYPES] + + # Build spatial indices once + self.regular_index = SpatialClusterIndex(self.regular_clusters) + self.picture_index = SpatialClusterIndex( + [c for c in self.special_clusters if c.label == DocItemLabel.PICTURE] + ) + self.wrapper_index = SpatialClusterIndex( + [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES] + ) + + def postprocess(self) -> Tuple[List[Cluster], List[Cell]]: + """Main processing pipeline.""" + self.regular_clusters = self._process_regular_clusters() + self.special_clusters = self._process_special_clusters() + + # Remove regular clusters that are 
def _process_regular_clusters(self) -> List[Cluster]:
    """Filter, relabel and de-duplicate the regular (non-special) clusters.

    Steps, in order:
      1. drop clusters below their label's confidence threshold,
      2. apply ``LABEL_REMAPPING``,
      3. assign text cells to the surviving clusters,
      4. wrap every still-unassigned cell in its own TEXT cluster,
      5. alternate bbox adjustment and overlap removal until the cluster
         count stops changing (at most three rounds).

    Returns:
        The processed list of regular clusters.
    """
    clusters = [
        c
        for c in self.regular_clusters
        if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
    ]

    # Apply label remapping
    for cluster in clusters:
        if cluster.label in self.LABEL_REMAPPING:
            cluster.label = self.LABEL_REMAPPING[cluster.label]

    # Initial cell assignment
    clusters = self._assign_cells_to_clusters(clusters)

    # Handle orphaned cells
    unassigned = self._find_unassigned_cells(clusters)
    if unassigned:
        # Number orphans past every id known to this processor, not just the
        # survivors of the confidence filter. Otherwise an orphan could reuse
        # the id of a special cluster or of a filtered-out regular cluster
        # that is still present in the spatial index.
        known_clusters = self.regular_clusters + self.special_clusters
        next_id = max((c.id for c in known_clusters), default=0) + 1
        clusters.extend(
            Cluster(
                id=next_id + offset,
                label=DocItemLabel.TEXT,
                bbox=cell.bbox,
                confidence=0.0,
                cells=[cell],
            )
            for offset, cell in enumerate(unassigned)
        )

    # Iterative refinement: stop early once a round removes nothing.
    prev_count = len(clusters) + 1
    for _ in range(3):  # Maximum 3 iterations
        if prev_count == len(clusters):
            break
        prev_count = len(clusters)
        clusters = self._adjust_cluster_bboxes(clusters)
        clusters = self._remove_overlapping_clusters(clusters, "regular")

    return clusters
def _remove_overlapping_clusters(
    self,
    clusters: List[Cluster],
    cluster_type: str,
    overlap_threshold: float = 0.8,
    containment_threshold: float = 0.8,
) -> List[Cluster]:
    """Collapse groups of mutually overlapping clusters into one cluster each.

    Clusters whose boxes overlap sufficiently (as judged by the spatial
    index's ``check_overlap``) are grouped transitively with a union-find.
    For each group the winner is chosen by ``_select_best_cluster`` and
    receives the cells of all other members.

    Args:
        clusters: Clusters of a single category.
        cluster_type: ``"regular"``, ``"picture"`` or anything else for
            wrappers; selects the spatial index. It must also be a key of
            ``OVERLAP_PARAMS``.
        overlap_threshold: IoU above which two boxes count as overlapping.
        containment_threshold: Containment ratio above which they do.

    Returns:
        One cluster per overlap group (ungrouped clusters are kept as-is).
    """
    if not clusters:
        return []

    if cluster_type == "regular":
        spatial_index = self.regular_index
    elif cluster_type == "picture":
        spatial_index = self.picture_index
    else:
        spatial_index = self.wrapper_index

    # Map of currently valid clusters
    valid_clusters = {c.id: c for c in clusters}
    uf = UnionFind(valid_clusters.keys())
    params = self.OVERLAP_PARAMS[cluster_type]

    for cluster in clusters:
        candidates = spatial_index.find_candidates(cluster.bbox)
        candidates &= valid_clusters.keys()  # Only keep existing candidates
        candidates.discard(cluster.id)

        for other_id in candidates:
            if spatial_index.check_overlap(
                cluster.bbox,
                valid_clusters[other_id].bbox,
                overlap_threshold,
                containment_threshold,
            ):
                uf.union(cluster.id, other_id)

    result = []
    for group in uf.get_groups().values():
        group_clusters = [valid_clusters[cid] for cid in group]
        if len(group_clusters) == 1:
            result.append(group_clusters[0])
            continue

        # The selection rule lives in one place; this method previously
        # carried a verbatim inline copy of the helper.
        best = self._select_best_cluster(
            group_clusters, params["area_threshold"], params["conf_threshold"]
        )
        # The winner inherits the cells of every cluster it replaces.
        for cluster in group_clusters:
            if cluster != best:
                best.cells.extend(cluster.cells)
        result.append(best)

    return result

def _select_best_cluster(
    self,
    clusters: List[Cluster],
    area_threshold: float,
    conf_threshold: float,
) -> Cluster:
    """Pick the cluster that should represent a group of overlapping clusters.

    A candidate is ruled out when some other cluster is more confident by
    more than ``conf_threshold`` while the candidate is at most
    ``area_threshold`` times its area. Among the remaining candidates a
    larger one replaces the current pick unless that pick is more confident
    by more than ``conf_threshold``. If every candidate is ruled out, the
    first cluster is returned.
    """
    current_best = None
    for candidate in clusters:
        should_select = True
        for other in clusters:
            if other == candidate:
                continue

            area_ratio = candidate.bbox.area() / other.bbox.area()
            conf_diff = other.confidence - candidate.confidence

            if area_ratio <= area_threshold and conf_diff > conf_threshold:
                should_select = False
                break

        if should_select:
            if current_best is None or (
                candidate.bbox.area() > current_best.bbox.area()
                and current_best.confidence - candidate.confidence <= conf_threshold
            ):
                current_best = candidate

    return current_best if current_best else clusters[0]
def _sort_clusters(self, clusters: List[Cluster]) -> List[Cluster]:
    """Return the clusters ordered top-to-bottom, then left-to-right.

    A cluster is positioned by the smallest ``(t, l)`` among its cells'
    boxes. Pictures and clusters without cells are positioned by their own
    bounding box instead.
    """

    def top_left(box) -> Tuple[float, float]:
        return (box.t, box.l)

    def anchor(cluster: Cluster) -> Tuple[float, float]:
        positioned_by_cells = (
            bool(cluster.cells) and cluster.label != DocItemLabel.PICTURE
        )
        if not positioned_by_cells:
            return top_left(cluster.bbox)
        # Smallest (t, l) over the cells, i.e. the first cell in reading order.
        return min(top_left(cell.bbox) for cell in cluster.cells)

    ordered = list(clusters)
    ordered.sort(key=anchor)
    return ordered
b/docling/utils/layout_utils.py deleted file mode 100644 index ceb18047..00000000 --- a/docling/utils/layout_utils.py +++ /dev/null @@ -1,812 +0,0 @@ -import copy -import logging - -import networkx as nx -from docling_core.types.doc import DocItemLabel - -logger = logging.getLogger("layout_utils") - - -## ------------------------------- -## Geometric helper functions -## The coordinates grow left to right, and bottom to top. -## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top. - - -def area(bbox): - return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - - -def contains(bbox_i, bbox_j): - ## Returns True if bbox_i contains bbox_j, else False - return ( - bbox_i[0] <= bbox_j[0] - and bbox_i[1] <= bbox_j[1] - and bbox_i[2] >= bbox_j[2] - and bbox_i[3] >= bbox_j[3] - ) - - -def is_intersecting(bbox_i, bbox_j): - return not ( - bbox_i[2] < bbox_j[0] - or bbox_i[0] > bbox_j[2] - or bbox_i[3] < bbox_j[1] - or bbox_i[1] > bbox_j[3] - ) - - -def bb_iou(boxA, boxB): - # determine the (x, y)-coordinates of the intersection rectangle - xA = max(boxA[0], boxB[0]) - yA = max(boxA[1], boxB[1]) - xB = min(boxA[2], boxB[2]) - yB = min(boxA[3], boxB[3]) - # compute the area of intersection rectangle - interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1) - # compute the area of both the prediction and ground-truth - # rectangles - boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1) - boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1) - # compute the intersection over union by taking the intersection - # area and dividing it by the sum of prediction + ground-truth - # areas - the interesection area - iou = interArea / float(boxAArea + boxBArea - interArea) - # return the intersection over union value - return iou - - -def compute_intersection(bbox_i, bbox_j): - ## Returns the size of the intersection area of the two boxes - if not is_intersecting(bbox_i, bbox_j): - return 0 - ## Determine the (x, y)-coordinates of the intersection rectangle: 
- xA = max(bbox_i[0], bbox_j[0]) - yA = max(bbox_i[1], bbox_j[1]) - xB = min(bbox_i[2], bbox_j[2]) - yB = min(bbox_i[3], bbox_j[3]) - ## Compute the area of intersection rectangle: - interArea = (xB - xA) * (yB - yA) - if interArea < 0: - logger.debug("Warning: Negative intersection detected!") - return 0 - return interArea - - -def surrounding(bbox_i, bbox_j): - ## Computes minimal box that contains both input boxes - sbox = [] - sbox.append(min(bbox_i[0], bbox_j[0])) - sbox.append(min(bbox_i[1], bbox_j[1])) - sbox.append(max(bbox_i[2], bbox_j[2])) - sbox.append(max(bbox_i[3], bbox_j[3])) - return sbox - - -def surrounding_list(bbox_list): - ## Computes minimal box that contains all boxes in the input list - ## The list should be non-empty, but just in case it's not: - if len(bbox_list) == 0: - sbox = [0, 0, 0, 0] - else: - sbox = [] - sbox.append(min([bbox[0] for bbox in bbox_list])) - sbox.append(min([bbox[1] for bbox in bbox_list])) - sbox.append(max([bbox[2] for bbox in bbox_list])) - sbox.append(max([bbox[3] for bbox in bbox_list])) - return sbox - - -def vertical_overlap(bboxA, bboxB): - ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number) - if bboxB[3] < bboxA[1]: ## B below A - return False - elif bboxA[3] < bboxB[1]: ## A below B - return False - else: - return True - - -def vertical_overlap_fraction(bboxA, bboxB): - ## Returns the vertical overlap as fraction of the lower bbox height. - ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number) - ## Height 0 is permitted in the input. 
- heightA = bboxA[3] - bboxA[1] - heightB = bboxB[3] - bboxB[1] - min_height = min(heightA, heightB) - if bboxA[3] >= bboxB[3]: ## A starts higher or equal - if ( - bboxA[1] <= bboxB[1] - ): ## B is completely in A; this can include height of B = 0: - fraction = 1 - else: - overlap = max(bboxB[3] - bboxA[1], 0) - fraction = overlap / max(min_height, 0.001) - else: - if ( - bboxB[1] <= bboxA[1] - ): ## A is completely in B; this can include height of A = 0: - fraction = 1 - else: - overlap = max(bboxA[3] - bboxB[1], 0) - fraction = overlap / max(min_height, 0.001) - return fraction - - -## ------------------------------- -## Cluster-and-cell relations - - -def compute_enclosed_cells( - cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2 -): - cells_in_cluster = [] - cells_in_cluster_int = [] - for ix, cell in enumerate(raw_cells): - cell_bbox = cell["bbox"] - intersection = compute_intersection(cell_bbox, cluster_bbox) - frac_area = area(cell_bbox) * min_cell_intersection_with_cluster - - if ( - intersection > frac_area and frac_area > 0 - ): # intersect > certain fraction of cell - cells_in_cluster.append(ix) - cells_in_cluster_int.append(intersection) - elif contains( - cluster_bbox, - [cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3], - ): - cells_in_cluster.append(ix) - return cells_in_cluster, cells_in_cluster_int - - -def find_clusters_around_cells(cell_count, clusters): - ## Per raw cell, find to which clusters it belongs. - ## Return list of these indices in the raw-cell order. - clusters_around_cells = [[] for _ in range(cell_count)] - for cl_ix, cluster in enumerate(clusters): - for ix in cluster["cell_ids"]: - clusters_around_cells[ix].append(cl_ix) - return clusters_around_cells - - -def find_cell_index(raw_ix, cell_array): - ## "raw_ix" is a rawcell_id. - ## "cell_array" has the structure of an (annotation) cells array. - ## Returns index of cell in cell_array that has this rawcell_id. 
- for ix, cell in enumerate(cell_array): - if cell["rawcell_id"] == raw_ix: - return ix - - -def find_cell_indices(cluster, cell_array): - ## "cluster" must have the structure as in a clusters array in a prediction, - ## "cell_array" that of a cells array. - ## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster, - ## in the order of the rawcell_ids. - result = [] - for raw_ix in sorted(cluster["cell_ids"]): - ## Find the cell with this rawcell_id (if any) - for ix, cell in enumerate(cell_array): - if cell["rawcell_id"] == raw_ix: - result.append(ix) - return result - - -def find_first_cell_index(cluster, cell_array): - ## "cluster" must be a dict with key "cell_ids"; it can also be a line. - ## "cell_array" has the structure of a cells array in an annotation. - ## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster. - result = [] ## We keep it a list as it can be empty (picture without text cells) - if len(cluster["cell_ids"]) == 0: - return result - raw_ix = min(cluster["cell_ids"]) - ## Find the cell with this rawcell_id (if any) - for ix, cell in enumerate(cell_array): - if cell["rawcell_id"] == raw_ix: - result.append(ix) - break ## One is enough; should be only one anyway. - if result == []: - logger.debug( - " Warning: Raw cell " + str(raw_ix) + " not found in annotation cells" - ) - return result - - -## ------------------------------- -## Cluster labels and text - - -def relabel_cluster(cluster, cl_ix, new_label, target_pred): - ## "cluster" must have the structure as in a clusters array in a prediction, - ## "cl_ix" is its index in target_pred, - ## "new_label" is the intended new label, - ## "target_pred" is the entire current target prediction. - ## Sets label on the cluster itself, and on the cells in the target_pred. - ## Returns new_label so that also the cl_label variable in the main code is easily set. 
- target_pred["clusters"][cl_ix]["type"] = new_label - cluster_target_cells = find_cell_indices(cluster, target_pred["cells"]) - for ix in cluster_target_cells: - target_pred["cells"][ix]["label"] = new_label - return new_label - - -def find_cluster_text(cluster, raw_cells): - ## "cluster" must be a dict with "cell_ids"; it can also be a line. - ## "raw_cells" must have the format of item["raw"]["cells"] - ## Returns the text of the cluster, with blanks between the cell contents - ## (which seem to be words or phrases without starting or trailing blanks). - ## Note that in formulas, this may give a lot more blanks than originally - cluster_text = "" - for raw_ix in sorted(cluster["cell_ids"]): - cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " " - return cluster_text.rstrip() - - -def find_cluster_text_without_blanks(cluster, raw_cells): - ## "cluster" must be a dict with "cell_ids"; it can also be a line. - ## "raw_cells" must have the format of item["raw"]["cells"] - ## Returns the text of the cluster, without blanks between the cell contents - ## Interesting in formula analysis. 
- cluster_text = "" - for raw_ix in sorted(cluster["cell_ids"]): - cluster_text = cluster_text + raw_cells[raw_ix]["text"] - return cluster_text.rstrip() - - -## ------------------------------- -## Clusters and lines -## (Most line-oriented functions are only needed in TextAnalysisGivenClusters, -## but this one also in FormulaAnalysis) - - -def build_cluster_from_lines(lines, label, id): - ## Lines must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox" - ## (There is no condition that they are really geometrically lines) - ## A cluster in standard format is returned with given label and id - local_lines = copy.deepcopy( - lines - ) ## without this, it changes "lines" also outside this function - first_line = local_lines.pop(0) - cluster = { - "id": id, - "type": label, - "cell_ids": first_line["cell_ids"], - "bbox": first_line["bbox"], - "confidence": 0, - "created_by": "merged_cells", - } - confidence = 0 - counter = 0 - for line in local_lines: - new_cell_ids = cluster["cell_ids"] + line["cell_ids"] - cluster["cell_ids"] = new_cell_ids - cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"]) - counter += 1 - confidence += line["confidence"] - confidence = confidence / counter - cluster["confidence"] = confidence - return cluster - - -## ------------------------------- -## Reading order - - -def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids): - ## In: - ## Clusters: list as in predictions. - ## cluster_sort_type: string, currently only "raw_cells". - ## cell_sort_type: string, currently only "raw_cells". - ## sort_ids: Boolean, whether the cluster ids should be adapted to their new position - ## Out: Another clusters list, sorted according to the type. 
- - logger.debug("---- Start cluster sorting ------") - - if cell_sort_type == "raw_cell_ids": - for cl in clusters: - sorted_cell_ids = sorted(cl["cell_ids"]) - cl["cell_ids"] = sorted_cell_ids - else: - logger.debug( - "Unknown cell_sort_type `" - + cell_sort_type - + "`, no cell sorting will happen." - ) - - if cluster_sort_type == "raw_cell_ids": - clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []] - clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []] - logger.debug( - "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells]) - ) - logger.debug( - " Their first cell ids: " - + str([cl["cell_ids"][0] for cl in clusters_with_cells]) - ) - logger.debug( - "Clusters without cells: " - + str([cl["id"] for cl in clusters_without_cells]) - ) - clusters_with_cells_sorted = sorted( - clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0] - ) - logger.debug( - " First cell ids after sorting: " - + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted]) - ) - sorted_clusters = clusters_with_cells_sorted + clusters_without_cells - else: - logger.debug( - "Unknown cluster_sort_type: `" - + cluster_sort_type - + "`, no cluster sorting will happen." - ) - - if sort_ids: - for i, cl in enumerate(sorted_clusters): - cl["id"] = i - return sorted_clusters - - -## ------------------------------- -## Line Splitting - - -def sort_cells_horizontal(line_cell_ids, raw_cells): - ## "line_cells" should be a non-empty list of (raw) cell_ids - ## "raw_cells" has the structure of item["raw"]["cells"]. - ## Sorts the cells in the line by x0 (left start). 
- new_line_cell_ids = sorted( - line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0] - ) - return new_line_cell_ids - - -def adapt_bboxes(raw_cells, clusters, orphan_cell_indices): - new_clusters = [] - for ix, cluster in enumerate(clusters): - new_cluster = copy.deepcopy(cluster) - logger.debug( - "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"]) - ) - logger.debug(" with cells: " + str(new_cluster["cell_ids"])) - if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE: - logger.debug(" Empty non-picture, removed") - continue ## Skip this former cluster, now without cells. - new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices) - new_cluster["bbox"] = new_bbox - new_clusters.append(new_cluster) - return new_clusters - - -def adapt_bbox(raw_cells, cluster, orphan_cell_indices): - if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]): - ## A text-like cluster. The bbox only needs to be around the text cells: - logger.debug(" Initial bbox: " + str(cluster["bbox"])) - new_bbox = surrounding_list( - [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]] - ) - logger.debug(" New bounding box:" + str(new_bbox)) - if cluster["type"] == DocItemLabel.PICTURE: - ## We only make the bbox completely comprise included text cells: - logger.debug(" Picture") - if len(cluster["cell_ids"]) != 0: - min_bbox = surrounding_list( - [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]] - ) - logger.debug(" Minimum bbox: " + str(min_bbox)) - logger.debug(" Initial bbox: " + str(cluster["bbox"])) - new_bbox = surrounding(min_bbox, cluster["bbox"]) - logger.debug(" New bbox (initial and text cells): " + str(new_bbox)) - else: - logger.debug(" without text cells, no change.") - new_bbox = cluster["bbox"] - else: ## A table - ## At least we have to keep the included text cells, and we make the bbox completely comprise them - min_bbox = surrounding_list( - [raw_cells[cid]["bbox"] for cid in 
cluster["cell_ids"]] - ) - logger.debug(" Minimum bbox: " + str(min_bbox)) - logger.debug(" Initial bbox: " + str(cluster["bbox"])) - new_bbox = surrounding(min_bbox, cluster["bbox"]) - logger.debug(" Possibly increased bbox: " + str(new_bbox)) - - ## Now we look which non-belonging cells are covered. - ## (To decrease dependencies, we don't make use of which cells we actually removed.) - ## We don't worry about orphan cells, those could still be added to the table. - enclosed_cells = compute_enclosed_cells( - new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3 - )[0] - additional_cells = set(enclosed_cells) - set(cluster["cell_ids"]) - logger.debug( - " Additional cells enclosed by Table bbox: " + str(additional_cells) - ) - spurious_cells = additional_cells - set(orphan_cell_indices) - logger.debug( - " Spurious cells enclosed by Table bbox (additional minus orphans): " - + str(spurious_cells) - ) - if len(spurious_cells) == 0: - return new_bbox - - ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can. - ## We initialize possible cuts with the current bbox. - left_cut = new_bbox[0] - right_cut = new_bbox[2] - upper_cut = new_bbox[3] - lower_cut = new_bbox[1] - - for cell_ix in spurious_cells: - cell = raw_cells[cell_ix] - # logger.debug(" Spurious cell bbox: " + str(cell["bbox"])) - is_left = cell["bbox"][2] < min_bbox[0] - is_right = cell["bbox"][0] > min_bbox[2] - is_above = cell["bbox"][1] > min_bbox[3] - is_below = cell["bbox"][3] < min_bbox[1] - # logger.debug(" Left, right, above, below? 
" + str([is_left, is_right, is_above, is_below])) - - if is_left: - if cell["bbox"][2] > left_cut: - ## We move the left cut to exclude this cell: - left_cut = cell["bbox"][2] - if is_right: - if cell["bbox"][0] < right_cut: - ## We move the right cut to exclude this cell: - right_cut = cell["bbox"][0] - if is_above: - if cell["bbox"][1] < upper_cut: - ## We move the upper cut to exclude this cell: - upper_cut = cell["bbox"][1] - if is_below: - if cell["bbox"][3] > lower_cut: - ## We move the left cut to exclude this cell: - lower_cut = cell["bbox"][3] - # logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut])) - - new_bbox = [left_cut, lower_cut, right_cut, upper_cut] - - logger.debug(" Final bbox: " + str(new_bbox)) - return new_bbox - - -def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5): - DuplicateDeletedClusterIDs = [] - for cluster_1 in cluster_predictions: - for cluster_2 in cluster_predictions: - if cluster_1["id"] != cluster_2["id"]: - if_conf = False - if cluster_1["confidence"] > cluster_2["confidence"]: - if_conf = True - if if_conf == True: - if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold: - DuplicateDeletedClusterIDs.append(cluster_2["id"]) - elif contains( - cluster_1["bbox"], - [ - cluster_2["bbox"][0] + 3, - cluster_2["bbox"][1] + 3, - cluster_2["bbox"][2] - 3, - cluster_2["bbox"][3] - 3, - ], - ): - DuplicateDeletedClusterIDs.append(cluster_2["id"]) - - DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs)) - - for cl_id in DuplicateDeletedClusterIDs: - for cluster in cluster_predictions: - if cl_id == cluster["id"]: - cluster_predictions.remove(cluster) - return cluster_predictions - - -# Assign orphan cells by a low confidence prediction that is below the assigned confidence -def assign_orphans_with_low_conf_pred( - cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices -): - for orph_id in orphan_cell_indices: - cluster_chosen = {} - iou_thresh = 
0.05 - confidence = 0.05 - - # Loop over all predictions, and find the one with the highest IOU, and confidence - for cluster in cluster_predictions_low: - calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"]) - cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * ( - cluster["bbox"][2] - cluster["bbox"][0] - ) - cell_area = ( - raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1] - ) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0]) - - if ( - (iou_thresh < calc_iou) - and (cluster["confidence"] > confidence) - and (cell_area * 3 > cluster_area) - ): - cluster_chosen = cluster - iou_thresh = calc_iou - confidence = cluster["confidence"] - # If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking - if iou_thresh != 0.05 and confidence != 0.05: - cluster_chosen["cell_ids"].append(orph_id) - cluster_chosen["created_by"] = "orph_low_conf" - cluster_predictions.append(cluster_chosen) - orphan_cell_indices.remove(orph_id) - return cluster_predictions, orphan_cell_indices - - -def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs): - for amb_cell_id in amb_cell_idxs: - highest_conf = 0 - highest_bbox_iou = 0 - cluster_chosen = None - problamatic_clusters = [] - - # Find clusters in question - for cluster in cluster_predictions: - - if amb_cell_id in cluster["cell_ids"]: - problamatic_clusters.append(amb_cell_id) - - # If the cell_id is in a cluster of high conf, and highest iou score, and smaller in area - bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"]) - - if ( - cluster["confidence"] > highest_conf - and bbox_iou_val > highest_bbox_iou - ): - cluster_chosen = cluster - highest_conf = cluster["confidence"] - highest_bbox_iou = bbox_iou_val - if cluster["id"] in problamatic_clusters: - problamatic_clusters.remove(cluster["id"]) - - # now remove the assigning of cell id from lower confidence, and threshold - for cluster in 
cluster_predictions: - for prob_amb_id in problamatic_clusters: - if prob_amb_id in cluster["cell_ids"]: - cluster["cell_ids"].remove(prob_amb_id) - amb_cell_idxs.remove(amb_cell_id) - - return cluster_predictions, amb_cell_idxs - - -def ranges(nums): - # Find if consecutive numbers exist within pdf cells - # Used to remove line numbers for review manuscripts - nums = sorted(set(nums)) - gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e] - edges = iter(nums[:1] + sum(gaps, []) + nums[-1:]) - return list(zip(edges, edges)) - - -def set_orphan_as_text( - cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices -): - max_id = -1 - figures = [] - for cluster in cluster_predictions: - if cluster["type"] == DocItemLabel.PICTURE: - figures.append(cluster) - - if cluster["id"] > max_id: - max_id = cluster["id"] - max_id += 1 - - lines_detector = False - content_of_orphans = [] - for orph_id in orphan_cell_indices: - orph_cell = raw_cells[orph_id] - content_of_orphans.append(raw_cells[orph_id]["text"]) - - fil_content_of_orphans = [] - for cell_content in content_of_orphans: - if cell_content.isnumeric(): - try: - num = int(cell_content) - fil_content_of_orphans.append(num) - except ValueError: # ignore the cell - pass - - # line_orphans = [] - # Check if there are more than 2 pdf orphan cells, if there are more than 2, - # then check between the orphan cells if they are numeric - # and if they are a consecutive series of numbers (using ranges function) to decide - - if len(fil_content_of_orphans) > 2: - out_ranges = ranges(fil_content_of_orphans) - if len(out_ranges) > 1: - cnt_range = 0 - for ranges_ in out_ranges: - if ranges_[0] != ranges_[1]: - # If there are more than 75 (half the total line number of a review manuscript page) - # decide that there are line numbers on page to be ignored. 
- if len(list(range(ranges_[0], ranges_[1]))) > 75: - lines_detector = True - # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1])) - - for orph_id in orphan_cell_indices: - orph_cell = raw_cells[orph_id] - if bool(orph_cell["text"] and not orph_cell["text"].isspace()): - fig_flag = False - # Do not assign orphan cells if they are inside a figure - for fig in figures: - if contains(fig["bbox"], orph_cell["bbox"]): - fig_flag = True - - # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans: - if fig_flag == False and lines_detector == False: - # get class from low confidence detections if not set as text: - class_type = DocItemLabel.TEXT - - for cluster in cluster_predictions_low: - intersection = compute_intersection( - orph_cell["bbox"], cluster["bbox"] - ) - class_type = DocItemLabel.TEXT - if ( - cluster["confidence"] > 0.1 - and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4 - ): - class_type = cluster["type"] - elif contains( - cluster["bbox"], - [ - orph_cell["bbox"][0] + 3, - orph_cell["bbox"][1] + 3, - orph_cell["bbox"][2] - 3, - orph_cell["bbox"][3] - 3, - ], - ): - class_type = cluster["type"] - elif intersection > area(orph_cell["bbox"]) * 0.2: - class_type = cluster["type"] - - new_cluster = { - "id": max_id, - "bbox": orph_cell["bbox"], - "type": class_type, - "cell_ids": [orph_id], - "confidence": -1, - "created_by": "orphan_default", - } - max_id += 1 - cluster_predictions.append(new_cluster) - return cluster_predictions, orphan_cell_indices - - -def merge_cells(cluster_predictions): - # Using graph component creates clusters if orphan cells are touching or too close. 
- G = nx.Graph() - for cluster in cluster_predictions: - if cluster["created_by"] == "orphan_default": - G.add_node(cluster["id"]) - - for cluster_1 in cluster_predictions: - for cluster_2 in cluster_predictions: - if ( - cluster_1["id"] != cluster_2["id"] - and cluster_2["created_by"] == "orphan_default" - and cluster_1["created_by"] == "orphan_default" - ): - cl1 = copy.deepcopy(cluster_1["bbox"]) - cl2 = copy.deepcopy(cluster_2["bbox"]) - cl1[0] = cl1[0] - 2 - cl1[1] = cl1[1] - 2 - cl1[2] = cl1[2] + 2 - cl1[3] = cl1[3] + 2 - cl2[0] = cl2[0] - 2 - cl2[1] = cl2[1] - 2 - cl2[2] = cl2[2] + 2 - cl2[3] = cl2[3] + 2 - if is_intersecting(cl1, cl2): - G.add_edge(cluster_1["id"], cluster_2["id"]) - - component = sorted(map(sorted, nx.k_edge_components(G, k=1))) - max_id = -1 - for cluster_1 in cluster_predictions: - if cluster_1["id"] > max_id: - max_id = cluster_1["id"] - - for nodes in component: - if len(nodes) > 1: - max_id += 1 - lines = [] - for node in nodes: - for cluster in cluster_predictions: - if cluster["id"] == node: - lines.append(cluster) - cluster_predictions.remove(cluster) - new_merged_cluster = build_cluster_from_lines( - lines, DocItemLabel.TEXT, max_id - ) - cluster_predictions.append(new_merged_cluster) - return cluster_predictions - - -def clean_up_clusters( - cluster_predictions, - raw_cells, - merge_cells=False, - img_table=False, - one_cell_table=False, -): - DuplicateDeletedClusterIDs = [] - - for cluster_1 in cluster_predictions: - for cluster_2 in cluster_predictions: - if cluster_1["id"] != cluster_2["id"]: - # remove any artifcats created by merging clusters - if merge_cells == True: - if contains( - cluster_1["bbox"], - [ - cluster_2["bbox"][0] + 3, - cluster_2["bbox"][1] + 3, - cluster_2["bbox"][2] - 3, - cluster_2["bbox"][3] - 3, - ], - ): - cluster_1["cell_ids"] = ( - cluster_1["cell_ids"] + cluster_2["cell_ids"] - ) - DuplicateDeletedClusterIDs.append(cluster_2["id"]) - # remove clusters that might appear inside tables, or images (such 
as pdf cells in graphs) - elif img_table == True: - if ( - cluster_1["type"] == DocItemLabel.TEXT - and cluster_2["type"] == DocItemLabel.PICTURE - or cluster_2["type"] == DocItemLabel.TABLE - ): - if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5: - DuplicateDeletedClusterIDs.append(cluster_1["id"]) - elif contains( - [ - cluster_2["bbox"][0] - 3, - cluster_2["bbox"][1] - 3, - cluster_2["bbox"][2] + 3, - cluster_2["bbox"][3] + 3, - ], - cluster_1["bbox"], - ): - DuplicateDeletedClusterIDs.append(cluster_1["id"]) - # remove tables that have one pdf cell - if one_cell_table == True: - if ( - cluster_1["type"] == DocItemLabel.TABLE - and len(cluster_1["cell_ids"]) < 2 - ): - DuplicateDeletedClusterIDs.append(cluster_1["id"]) - - DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs)) - - for cl_id in DuplicateDeletedClusterIDs: - for cluster in cluster_predictions: - if cl_id == cluster["id"]: - cluster_predictions.remove(cluster) - return cluster_predictions - - -def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold): - for cluster in clusters: - cells_in_cluster, _ = compute_enclosed_cells( - cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold - ) - cluster["cell_ids"] = cells_in_cluster - ## These cell_ids are ids of the raw cells. - ## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction. - return clusters - - -# Creates a map of cell_id->cluster_id -def cell_id_state_map(clusters, cell_count): - clusters_around_cells = find_clusters_around_cells(cell_count, clusters) - orphan_cell_indices = [ - ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0 - ] # which cells are assigned no cluster? - ambiguous_cell_indices = [ - ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1 - ] # which cells are assigned > 1 clusters? 
- return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 2d300904..12893e22 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -74,6 +74,10 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True + pipeline_options.ocr_options.lang = "es" + pipeline_options.accelerator_options = AcceleratorOptions( + num_threads=4, device=Device.AUTO + ) doc_converter = DocumentConverter( format_options={ diff --git a/poetry.lock b/poetry.lock index 184658ea..221728b7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,36 @@ # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +[[package]] +name = "accelerate" +version = "1.1.1" +description = "Accelerate" +optional = false +python-versions = ">=3.9.0" +files = [ + {file = "accelerate-1.1.1-py3-none-any.whl", hash = "sha256:61edd81762131b8d4bede008643fa1e1f3bf59bec710ebda9771443e24feae02"}, + {file = "accelerate-1.1.1.tar.gz", hash = "sha256:0d39dfac557052bc735eb2703a0e87742879e1e40b88af8a2f9a93233d4cd7db"}, +] + +[package.dependencies] +huggingface-hub = ">=0.21.0" +numpy = ">=1.17,<3.0.0" +packaging = ">=20.0" +psutil = "*" +pyyaml = "*" +safetensors = ">=0.4.3" +torch = ">=1.10.0" + +[package.extras] +deepspeed = ["deepspeed"] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"] +rich = ["rich"] +sagemaker = ["sagemaker"] +test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", 
"scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"] +test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"] +testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] + [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -231,6 +262,21 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +[[package]] +name = "autoflake" +version = "2.3.1" +description = "Removes unused imports and unused variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "autoflake-2.3.1-py3-none-any.whl", hash = "sha256:3ae7495db9084b7b32818b4140e6dc4fc280b712fb414f5b8fe57b0a8e85a840"}, + {file = "autoflake-2.3.1.tar.gz", hash = "sha256:c98b75dc5b0a86459c4f01a1d32ac7eb4338ec4317a4469515ff1e687ecd909e"}, +] + +[package.dependencies] +pyflakes = ">=3.0.0" +tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} + [[package]] name = "autopep8" version = "2.2.0" @@ -793,64 +839,32 @@ name = "deepsearch-glm" version = "0.26.2" description = "Graph Language Models" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:00453a02bc8df959da576bc598ba528b394a9c016d6a428efc948c867be98938"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:9e6f654ab4d9dc3e6e2033c9c45294c36e5e62650cac0e4a650af576364eb370"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_14_0_arm64.whl", hash = 
"sha256:1fdf2fce9d642bbc5222600a1b280a7413aa640ed01acee13d43401ec27d6ad5"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:218cab085a58b88c55dbeb80cc5f5f7b3c5a96c8537eb2ada8e5cab70cd8e439"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75be007e62d11780f2433b213dad14d14a270c3607e909fd1fc95efdf02446c6"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a9b34c6cfb8b873ccf6e0072f5434c0c65a1d90652a6b901becc5b3b1695106"}, - {file = "deepsearch_glm-0.26.2-cp310-cp310-win_amd64.whl", hash = "sha256:f4b63c6e1d4a7be597efbe96052286bca805784cd7283a037919c349971051c5"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:eaabedca45fdd87dc455dc08b1785db15ba5ea6b706820330447f2cf7f03a67a"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:101bc2a79027df555050d08112717249916c4d82ad5815be2a1ac0581d9ab2b5"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:000d4a4895c4ff89c465b746bb7db3bb054a1fb5c3fabe2772d5431700c15d33"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2d97f9ebdff1a9086cc32ddd0abb14b42c4b4b2ae666986078fd77db3aa4487d"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:666a3b53b0949735cff77a8209f2833866e34b635ca0c7f444807963d8379d93"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aae1ec83222ef39e045f0186023473e5ce2ed30846c13f2943192d34d57c0f"}, - {file = "deepsearch_glm-0.26.2-cp311-cp311-win_amd64.whl", hash = "sha256:9bb173dcd0caef1d8a0d440e1ac3e9959c6b849e06b95b1d9b436661504c98f7"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:bb286be157a7b163b46a4d1f7e48a30d5cc365d4926c18e8b3c72994a8f296f7"}, - {file 
= "deepsearch_glm-0.26.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:defca9ecf1451ce3422b7783ea188571ffad7c941dbf52acc2638c5a4ffa7743"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:226f8862c616a4def202a6d0f71eb5d8e9f6ddbded2cf431c146150303888cf8"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:6ff0fe662254835763ad7d3edc2db320de8d233f645064e0356187d8e1fabe3b"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c1b84ec5b1308de37c660f49570ee1e72bd7f0f607566344446b9293f1183c"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d634eeaae8943e1912c0dfbf3193e09bea8c1aac38db8a6fa1f03fe6a49cb84"}, - {file = "deepsearch_glm-0.26.2-cp312-cp312-win_amd64.whl", hash = "sha256:9294087d26037574817e8e1710e387fd9ef9ba4328705de86dd40d819f32909a"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:df7181143c62a1f0e166bc9ffb25deab617b53ba7c468284e3072b861c17405a"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:2c3fef2c8394d6dc22d1bcdab12d0f46df9b411c5431dfb585a2c7bb128e1744"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f641a88421aa806ccef8f8e657fbb65135f59732110d21b5103c09138a659315"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf78499892caffb4bdc020b8c50ab7d623f568478375dcc2e3ec107d40972adc"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72f2b432b81b0bc7c87e33c41a97c7a8da2536dd2b337eb1b7d054fba12d556"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4703cae0d329b77e1d97892910313035204daa026d6e67ce6eb1b3e74e41f93e"}, - {file = "deepsearch_glm-0.26.2-cp313-cp313-win_amd64.whl", hash = 
"sha256:c906c75d080414490727de416fd1782bc6a10301378f72a741aa227b183832cf"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:10a366512540eff9f76645eb521df3469a160e8460ff6c3c1bfe172342c6c670"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:304988f1e08bd86a8a7b7cc0495e38faf586231f33f05c1023597c6177758572"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:c8f69b877846031648811ff80070b90b834bf9e4cdd74e5c2d93c7e18f408cd1"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1ba12361d1e4b8b02a72f515028f22686d98526a703a1091f89e9487fa3aa3c7"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c03bb8b3cdb2952c9c269849830f7830fa7e0384b76809e25f4c2d5d091f746c"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fe719b26d7cfcf5632a56be1f1420920fcdbea4418c014dd6e7e218dd2aca11"}, - {file = "deepsearch_glm-0.26.2-cp39-cp39-win_amd64.whl", hash = "sha256:2b31fa419287af3429efc2d5610cbf2428bafc762e45b610a48ad30dffedaa9e"}, - {file = "deepsearch_glm-0.26.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6df2504998e60c1aac3655820ad25e5eccca137da2e9f78fb53dc0fd0d1cdbf4"}, - {file = "deepsearch_glm-0.26.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e1b4a789ec9555ec9f4ff6730d68081be37eaa43cb51c9463962967c9f672684"}, - {file = "deepsearch_glm-0.26.2.tar.gz", hash = "sha256:7a607e78903b66d28beac3408156c11ab7b34ee70e8ccd0d292b28433e5a9c1d"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -docling-core = ">=2.0,<3.0" +docling-core = "^2.0" docutils = "!=0.21" numpy = ">=1.24.4,<3.0.0" pandas = ">=1.5.1,<3.0.0" -python-dotenv = ">=1.0.0,<2.0.0" -pywin32 = {version = ">=307,<308", markers = "sys_platform == \"win32\""} -requests = ">=2.32.3,<3.0.0" -rich = ">=13.7.0,<14.0.0" +python-dotenv = "^1.0.0" 
+pywin32 = {version = "^307", markers = "sys_platform == \"win32\""} +requests = "^2.32.3" +rich = "^13.7.0" tabulate = ">=0.8.9" -tqdm = ">=4.64.0,<5.0.0" +tqdm = "^4.64.0" [package.extras] pyplot = ["matplotlib (>=3.7.1,<4.0.0)"] toolkit = ["deepsearch-toolkit (>=1.1.0,<2.0.0)"] +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/deepsearch-glm.git" +reference = "cau/layout-processing-children-payloads" +resolved_reference = "8fac776c07fb7541d17ebc9db48c9900074f25b1" + [[package]] name = "defusedxml" version = "0.7.1" @@ -893,94 +907,74 @@ name = "docling-core" version = "2.6.1" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"}, - {file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -jsonref = ">=1.1.0,<2.0.0" -jsonschema = ">=4.16.0,<5.0.0" -pandas = ">=2.1.4,<3.0.0" -pillow = ">=10.3.0,<11.0.0" +jsonref = "^1.1.0" +jsonschema = "^4.16.0" +pandas = "^2.1.4" +pillow = "^10.3.0" pydantic = ">=2.6.0,<2.10" pyyaml = ">=5.1,<7.0.0" -tabulate = ">=0.9.0,<0.10.0" -typing-extensions = ">=4.12.2,<5.0.0" +tabulate = "^0.9.0" +typing-extensions = "^4.12.2" + +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/docling-core.git" +reference = "feat-add-legacy-convert" +resolved_reference = "4434b1073dc15fefb75f28c37299abd32d9c532f" [[package]] name = "docling-ibm-models" version = "2.0.7" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_ibm_models-2.0.7-py3-none-any.whl", hash = "sha256:bf362add22e9c526ac56c04bce412d7bb1c331b44a73204abba0b1d90a500c78"}, - {file = 
"docling_ibm_models-2.0.7.tar.gz", hash = "sha256:e1372c4f2517d522125fb02a820558f01914926f532bcd0534f1028a25d63667"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] +accelerate = "^1.1.1" huggingface_hub = ">=0.23,<1" -jsonlines = ">=3.1.0,<4.0.0" +jsonlines = "^3.1.0" numpy = ">=1.24.4,<3.0.0" -opencv-python-headless = ">=4.6.0.66,<5.0.0.0" -Pillow = ">=10.0.0,<11.0.0" -torch = ">=2.2.2,<3.0.0" -torchvision = ">=0,<1" -tqdm = ">=4.64.0,<5.0.0" +opencv-python-headless = "^4.6.0.66" +Pillow = "^10.0.0" +torch = "^2.2.2" +torchvision = "^0" +tqdm = "^4.64.0" +transformers = "^4.46.2" + +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/docling-ibm-models.git" +reference = "nli/performance" +resolved_reference = "c1bed7d5451ee16b7fb5b0bc5e847f599ed93aa7" [[package]] name = "docling-parse" version = "2.1.2" description = "Simple package to extract text with coordinates from programmatic PDFs" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_parse-2.1.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:140319e3eac73f9768d35313739891ae637af57fda03eade17d90e2d28ad80eb"}, - {file = "docling_parse-2.1.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:cec968a436ad14e8a45a72fc0e0074750eee28548a14f3c3df5157a68ac958e7"}, - {file = "docling_parse-2.1.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:c84eba992fee49d190cf4834fd44ef4e6549c3f1fcd41b91622114703a7e4a87"}, - {file = "docling_parse-2.1.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ae02af07f3dd335f56383a83efdc1f6450b7d38e21e1131005dbd341eb38e47d"}, - {file = "docling_parse-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fa0731e97d2644ff8a3257ae53208b88be3ddc6a4bc54fbe39e21f8395530f0"}, - {file = "docling_parse-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26d60136aab5f4a3a773922a8dcc530334165331660d074cd88dcd5d91206cd"}, - {file = 
"docling_parse-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:76eef41d50017c2fc531face44c1a35bef66095951622617d0f281e35d18e9e0"}, - {file = "docling_parse-2.1.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:7f1ad037d3ac0d80252c493e73b12688ded3ece9bae7954ba62765506c139d21"}, - {file = "docling_parse-2.1.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9f1360c0558c84f4b6633b0882256f6d621fd9e52179acae39c727a43b48d937"}, - {file = "docling_parse-2.1.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:5d505c2d3e9eff4f3064b4d1f017a3c6577b5d8ba55540d558f4899561862956"}, - {file = "docling_parse-2.1.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:58f552f61ac35c02890b03fe59b06552353314c3c1ee2a050c68a8a206ab1b4b"}, - {file = "docling_parse-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22069dadcfdcebc02e36e27f80d452f1265a5a97d894f2391490bf099bc5432c"}, - {file = "docling_parse-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68942b31684a021e27b9b07d27ed139911444b33963f7e0b5d2dbda8aaa5cb1"}, - {file = "docling_parse-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d87e3fbf1549cd8bc171240c18584ba8c32f83963b5af66b2a70a2bc3af56d2e"}, - {file = "docling_parse-2.1.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:5b00b81fa8eb0b34621f1ef9d07623d7dbcc354a33295a5b0c4209c39b1ff8eb"}, - {file = "docling_parse-2.1.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:1b99b122f941d0f19e92a215e589b94f49db899c5eec0147e83824652b18ce74"}, - {file = "docling_parse-2.1.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:744fe368a8fa49778e881c1052427c38a7d0e367273fcdef493e047513783108"}, - {file = "docling_parse-2.1.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b8a3e558a96f7d593269be75ba4147ebe221f5edad3d41244cef3533e8a51b74"}, - {file = "docling_parse-2.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:afcf53bce8c91886c1360e625e51d15ebfb36d37cd53b6e019e86ce1118c1d0c"}, - {file = "docling_parse-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89d25fc4fb8f16a8ed5bc8c4f00a77739d2536732c0ddae16340b1859adf68fd"}, - {file = "docling_parse-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:28a7f49a865a0cd71033a7899aac00c7d2e3b6c3a76488f8676ba0fc353d9f3a"}, - {file = "docling_parse-2.1.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:ad1560532cdf15dcb4a6005c8b7fe19def0e910e6125863f14978d6d07a1ba47"}, - {file = "docling_parse-2.1.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:19003b1bb64cd5a40999a3c5ffcb9a9d9608a073949b76acc58d58fb5054ea03"}, - {file = "docling_parse-2.1.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:041bf1c72a23d62e2dd30dcc3508222f6674e85b0f1d19a3196fd6d7b5f56015"}, - {file = "docling_parse-2.1.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:12403c26e833d8fdf0f406d2895f5108fd07b64a4d929c9105ca60f09b882c34"}, - {file = "docling_parse-2.1.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1082e227af3e31085eff3e96103b09becdf95324304e17ce0b1b61c43b93fbb7"}, - {file = "docling_parse-2.1.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77b36e36d1e07a06a1616ee281079d6b972c3059f2fa02dafcfc225a41e5bd1a"}, - {file = "docling_parse-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:4300df86657935b0109c44702857ebf3d0713f1bbe376982f369504a762e2fef"}, - {file = "docling_parse-2.1.2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:912fe44507f209d997e1183f38a71d4e14c31d53a164fb862631822624dad892"}, - {file = "docling_parse-2.1.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:10ff1928b12099f446fcd0b043182173e6b02ce74008ea6ce921d56cdee8964e"}, - {file = "docling_parse-2.1.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:391ad31a4086fabbc290851432f4cf0bdc366e07a454adf49e42029898d6b477"}, - {file = "docling_parse-2.1.2-cp39-cp39-macosx_14_0_x86_64.whl", 
hash = "sha256:ebf478e99c0c16d7dad30c0fdb1f5e236ae94d48da8dec48dbe5f0841eead4ed"}, - {file = "docling_parse-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b1c904017330d096981b7db6b225b66aff1cebdc422843103a782121d6e8be8"}, - {file = "docling_parse-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc8ec6ad1bec6168991b895d749b222bef14b568d1d9f6c06efaeb1645dfe12"}, - {file = "docling_parse-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e6eb130aa367247e1f32225bb1608cee901d711b475527404bbc4330c9199b99"}, - {file = "docling_parse-2.1.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ef88d565c761b48f8a175fd474e068c0da9d4401e22d3e38de73e2f00f3df2d1"}, - {file = "docling_parse-2.1.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:bdc8ccbdc4ab91b829b8c421ad89da276442a2c891eda1f6507f248d0bd8dff9"}, - {file = "docling_parse-2.1.2.tar.gz", hash = "sha256:3c249f50e6351eb6126331a179fe86b64dc2073e9f881d52f8c8fb391633b89e"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] +autoflake = "^2.3.1" +pillow = "^10.4.0" pywin32 = {version = ">=305", markers = "sys_platform == \"win32\""} tabulate = ">=0.9.0,<1.0.0" +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/docling-parse.git" +reference = "dev/expose-cell-sanitisation-via-python" +resolved_reference = "8ea65ae3080db88f54f8a3f7b622e7b002c9b7f0" + [[package]] name = "docutils" version = "0.21.2" @@ -3192,6 +3186,7 @@ files = [ {file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"}, {file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"}, {file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"}, + {file = "nh3-0.2.19.tar.gz", hash = 
"sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"}, ] [[package]] @@ -3766,9 +3761,9 @@ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -3792,9 +3787,9 @@ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == 
\"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -5474,12 +5469,12 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "rapidocr-onnxruntime" -version = "1.4.0" +version = "1.4.1" description = "A cross platform OCR Library based on OnnxRuntime." optional = true python-versions = "<3.13,>=3.6" files = [ - {file = "rapidocr_onnxruntime-1.4.0-py3-none-any.whl", hash = "sha256:d21c4ba2ef80b7a8ecf8178632f273398a92ab44a1ffb9e171139ef2a589d690"}, + {file = "rapidocr_onnxruntime-1.4.1-py3-none-any.whl", hash = "sha256:5ecdb8f4f3beec56630197f87c3e67ab744fce0cc66394b7b1da08c8c96a727f"}, ] [package.dependencies] @@ -5700,112 +5695,114 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "rpds-py" -version = "0.22.0" +version = "0.22.1" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false python-versions = ">=3.9" files = [ - {file = "rpds_py-0.22.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a4366f264fa60d3c109f0b27af0cd9eb8d46746bd70bd3d9d425f035b6c7e286"}, - {file = "rpds_py-0.22.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e34a3e665d38d0749072e6565400c8ce9abae976e338919a0dfbfb0e1ba43068"}, - {file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38cacf1f378571450576f2c8ce87da6f3fddc59d744de5c12b37acc23285b1e1"}, - {file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8cbb040fec8eddd5a6a75e737fd73c9ce37e51f94bacdd0b178d0174a4758395"}, - {file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d80fd710b3307a3c63809048b72c536689b9b0b31a2518339c3f1a4d29c73d7a"}, - {file = 
"rpds_py-0.22.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b5d17d8f5b885ce50e0cda85f99c0719e365e98b587338535fa566a48375afb"}, - {file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7a048ec1ebc991331d709be4884dc318c9eaafa66dcde8be0933ac0e702149"}, - {file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:306da3dfa174b489a3fc63b0872e2226a5ddf94c59875a770d72aff945d5ed96"}, - {file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c7b4450093c0c909299770226fb0285be47b0a57545bae25b5c4e51566b0e587"}, - {file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0903ffdb5b9007e503203b6285e4ff0faf96d875c19f1d103b475acf7d9f7311"}, - {file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d1522025cda9e57329aade769f56e5793b2a5da7759a21914ee10e67e17e601e"}, - {file = "rpds_py-0.22.0-cp310-cp310-win32.whl", hash = "sha256:49e084d47a66027ac72844f9f52f13d347a9a1f05d4f84381b420e47f836a7fd"}, - {file = "rpds_py-0.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:d9ceca96df54cb1675a0b7f52f1c6d5d1df62c5b40741ba211780f1b05a282a2"}, - {file = "rpds_py-0.22.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:771c9a3851beaa617d8c8115d65f834a2b52490f42ee2b88b13f1fc5529e9e0c"}, - {file = "rpds_py-0.22.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:341a07a4b55126bfae68c9bf24220a73d456111e5eb3dcbdab9fd16de2341224"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7649c8b8e4bd1ccc5fcbd51a855d57a617deeba19c66e3d04b1abecc61036b2"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f513758e7cda8bc262e80299a8e3395d7ef7f4ae705be62632f229bc6c33208"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ba1fc34d0b2f6fd53377a4c954116251eba6d076bf64f903311f4a7d27d10acd"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:632d2fdddd9fbe3ac8896a119fd18a71fc95ca9c4cbe5223096c142d8c4a2b1d"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:326e42f2b49462e05f8527a1311ce98f9f97c484b3e443ec0ea4638bed3aebcf"}, - {file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e9bbdba9e75b1a9ee1dd1335034dad998ef1acc08492226c6fd50aa773bdfa7d"}, - {file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:41f65a97bf2c4b161c9f8f89bc37058346bec9b36e373c8ad00a16c957bff625"}, - {file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0686f2c16eafdc2c6b4ce6e86e5b3092e87db09ae64be2787616444eb35b9756"}, - {file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4e7c9aa2353eb0b0d845323857197daa036c2ff8624df990b0d886d22a8f665e"}, - {file = "rpds_py-0.22.0-cp311-cp311-win32.whl", hash = "sha256:2d2fc3ab021be3e0b5aec6d4164f2689d231b8bfc5185cc454314746aa4aee72"}, - {file = "rpds_py-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:87453d491369cd8018016d2714a13e8461975161703c18ee31eecf087a8ae5d4"}, - {file = "rpds_py-0.22.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:e9d4293b21c69ee4f9e1a99ac4f772951d345611c614a0cfae2ec6b565279bc9"}, - {file = "rpds_py-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:67e013a17a3db4d98cc228fd5aeb36a51b0f5cf7330b9102a552060f1fe4e560"}, - {file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b639a19e1791b646d27f15d17530a51722cc728d43b2dff3aeb904f92d91bac"}, - {file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1357c3092702078b7782b6ebd5ba9b22c1a291c34fbf9d8f1a48237466ac7758"}, - {file = 
"rpds_py-0.22.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:842855bbb113a19c393c6de5aa6ed9a26c6b13c2fead5e49114d39f0d08b94d8"}, - {file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ae7927cd2b869ca4dc645169d8af5494a29c99afd0ea0f24dd00c811ab1d8b8"}, - {file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91bfef5daa2a5a4fe62f8d317fc91a626073639f951f851bd2cb252d01bc6c5"}, - {file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4fc4824e38c1e91a73bc820e7caacaf19d0acd557465aceef0420ca59489b390"}, - {file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:92d28a608127b357da47c99e0d0e0655ca2060286540fe9f2a25a2e8ac666e05"}, - {file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c637188b930175c256f13adbfc427b83ec7e64476d1ec9d6608f312bb84e06c3"}, - {file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:93bbd66f46dddc41e8c656130c97c0fb515e0fa44e1eebb2592769dbbd41b2f5"}, - {file = "rpds_py-0.22.0-cp312-cp312-win32.whl", hash = "sha256:54d8f94dec5765a9edc19610fecf0fdf9cab36cbb9def1213188215f735a6f98"}, - {file = "rpds_py-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:931bf3d0705b2834fed29354f35170fa022fe22a95542b61b7c66aca5f8a224f"}, - {file = "rpds_py-0.22.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:2a57300cc8b034c5707085249efd09f19116bb80278d0ec925d7f3710165c510"}, - {file = "rpds_py-0.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c398a5a8e258dfdc5ea2aa4e5aa2ca3207f654a8eb268693dd1a76939074a588"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a6cc4eb1e86364331928acafb2bb41d8ab735ca3caf2d6019b9f6dac3f4f65d"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:574c5c94213bc9990805bfd7e4ba3826d3c098516cbc19f0d0ef0433ad93fa06"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c0321bc03a1c513eca1837e3bba948b975bcf3a172aebc197ab3573207f137a"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d276280649305c1da6cdd84585d48ae1f0efa67434d8b10d2df95228e59a05bb"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c17b43fe9c6da16885e3fe28922bcd1a029e61631fb771c7d501019b40bcc904"}, - {file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:48c95997af9314f4034fe5ba2d837399e786586e220835a578d28fe8161e6ae5"}, - {file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e9aa4af6b879bb75a3c7766fbf49d77f4097dd12b548ecbbd8b3f85caa833281"}, - {file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8426f97117b914b9bfb2a7bd46edc148e8defda728a55a5df3a564abe70cd7a4"}, - {file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:034964ea0ea09645bdde13038b38abb14be0aa747f20fcfab6181207dd9e0483"}, - {file = "rpds_py-0.22.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:3dc7c64b56b82428894f056e9ff6e8ee917ff74fc26b65211a33602c2372e928"}, - {file = "rpds_py-0.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:1212cb231f2002934cd8d71a0d718fdd9d9a2dd671e0feef8501038df3508026"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f21e1278c9456cd601832375c778ca44614d3433996488221a56572c223f04a"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:875fe8dffb43c20f68379ee098b035a7038d7903c795d46715f66575a7050b19"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:e23dcdd4b2ff9c6b3317ea7921b210d39592f8ca1cdea58ada25b202c65c0a69"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0fb8efc9e579acf1e556fd86277fecec320c21ca9b5d39db96433ad8c45bc4a"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe23687924b25a2dee52fab15976fd6577ed8518072bcda9ff2e2b88ab1f168b"}, - {file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5469b347445d1c31105f33e7bfc9a8ba213d48e42641a610dda65bf9e3c83f5"}, - {file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a810a57ce5e8ecf8eac6ec4dab534ff80c34e5a2c31db60e992009cd20f58e0f"}, - {file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d9bb9242b38a664f307b3b897f093896f7ed51ef4fe25a0502e5a368de9151ea"}, - {file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b4660943030406aaa40ec9f51960dd88049903d9536bc3c8ebb5cc4e1f119bbe"}, - {file = "rpds_py-0.22.0-cp313-cp313t-win32.whl", hash = "sha256:208ce1d8e3af138d1d9b21d7206356b7f29b96675e0113aea652cf024e4ddfdc"}, - {file = "rpds_py-0.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e6da2e0500742e0f157f005924a0589f2e2dcbfdd6cd0cc0abce367433e989be"}, - {file = "rpds_py-0.22.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:f980a0640599a74f27fd9d50c84c293f1cb7afc2046c5c6d3efaf8ec7cdbc326"}, - {file = "rpds_py-0.22.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ca505fd3767a09a139737f3278bc8a485cb64043062da89bcba27e2f2ea78d33"}, - {file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba235e00e0878ba1080b0f2a761f143b2a2d1c354f3d8e507fbf2f3de401bf18"}, - {file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:81e7a27365b02fe70a77f1365376879917235b3fec551d19b4c91b51d0bc1d07"}, - {file = 
"rpds_py-0.22.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:32a0e24cab2daae0503b06666d516e90a080c1a95aff0406b9f03c6489177c4b"}, - {file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a73ed43d64209e853bba567a543170267a5cd64f359540b0ca2d597e329ba172"}, - {file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0abcce5e874474d3eab5ad53be03dae2abe651d248bdeaabe83708e82969e78"}, - {file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4e9946c8c7def17e4fcb5eddb14c4eb6ebc7f6f309075e6c8d23b133c104607"}, - {file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:758098b38c344d9a7f279baf0689261777e601f620078ef5afdc9bd3339965c3"}, - {file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9ad4640a409bc2b7d22b7921e7660f0db96c5c8c69fbb2e8f3261d4f71d33983"}, - {file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8c48fc7458fe3a74dcdf56ba3534ff41bd421f69436df09ff3497fdaac18b431"}, - {file = "rpds_py-0.22.0-cp39-cp39-win32.whl", hash = "sha256:fde778947304e55fc732bc8ea5c6063e74244ac1808471cb498983a210aaf62c"}, - {file = "rpds_py-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:5fdf91a7c07f40e47b193f2acae0ed9da35d09325d7c3c3279f722b7cbf3d264"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c8fd7a16f7a047e06c747cfcf2acef3ac316132df1c6077445b29ee6f3f3a70b"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:6b6e4bcfc32f831bfe3d6d8a5acedfbfd5e252a03c83fa24813b277a3a8a13ca"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eadd2417e83a77ce3ae4a0efd08cb0ebdfd317b6406d11020354a53ad458ec84"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:f9dc2113e0cf0dd637751ca736186fca63664939ceb9f9f67e93ade88c69c0c9"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc2c00acdf68f1f69a476b770af311a7dc3955b7de228b04a40bcc51ac4d743b"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dfdabdf8519c93908b2bf0f87c3f86f9e88bab279fb4acfd0907519ca5a1739f"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8338db3c76833d02dc21c3e2c42534091341d26e4f7ba32c6032bb558a02e07b"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8ad4dfda52e64af3202ceb2143a62deba97894b71c64a4405ee80f6b3ea77285"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:3b94b074dcce39976db22ea75c7aea8b22d95e6d3b62f76e20e1179a278521d8"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:d4f2af3107fe4dc40c0d1a2409863f5249c6796398a1d83c1d99a0b3fa6cfb8d"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:bb11809b0de643a292a82f728c494a2bbef0e30a7c42d37464abbd6bef7ca7b1"}, - {file = "rpds_py-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1c21030ed494deb10226f90e2dbd84a012d59810c409832714a3dd576527be2"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:64a0c965a1e299c9b280006bdb15c276c427c45360aed676305dc36bcaa4d13c"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:2498ff422823be087b48bc82710deb87ac34f6b7c8034ee39920647647de1e60"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59e63da174ff287db05ef7c21d75974a5bac727ed60452aeb3a14278477842a8"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:e1c04fb380bc8efaae2fdf17ed6cd5d223da78a8b0b18a610f53d4c5d6e31dfd"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e04919ffa9a728c446b27b6b625fa1d00ece221bdb9d633e978a7e0353a12c0e"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24c28df05bd284879d0fac850ba697077d2a33b7ebcaea6318d6b6cdfdc86ddc"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d33622dc63c295788eed09dbb1d11bed178909d3267b02d873116ee6be368244"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7539dbb8f705e13629ba6f23388976aad809e387f32a6e5c0712e4e8d9bfcce7"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:b8906f537978da3f7f0bd1ba37b69f6a877bb43312023b086582707d2835bf2f"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:62ab12fe03ffc49978d29de9c31bbb216610157f7e5ca8e172fed6642aead3be"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:762206ba3bf1d6c8c9e0055871d3c0d5b074b7c3120193e6c067e7866f106ab1"}, - {file = "rpds_py-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed0102146574e5e9f079b2e1a06e6b5b12a691f9c74a65b93b7f3d4feda566c6"}, - {file = "rpds_py-0.22.0.tar.gz", hash = "sha256:32de71c393f126d8203e9815557c7ff4d72ed1ad3aa3f52f6c7938413176750a"}, + {file = "rpds_py-0.22.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ab27dd4edd84b13309f268ffcdfc07aef8339135ffab7b6d43f16884307a2a48"}, + {file = "rpds_py-0.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9d5b925156a746dc1f5f52376fdd1fbdd3f6ffe1fcd6f5e06f77ca79abb940a3"}, + {file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:201650b309c419143775c15209c620627de3c09a27c7fb58375325aec5cce260"}, + {file = 
"rpds_py-0.22.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:31264187fc934ff1024a4f56775f33c9252d3f4f3e27ec07d1995a26b52702c3"}, + {file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97c5ffe47ccf92d8b17e10f8a5ce28d015aa1196edc3359684cf31504eae6a14"}, + {file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9ac7280bd045f472b50306d7efeee051b69e3a2dd1b90f46bd7e86e63b1efa2"}, + {file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f941fb86195f97be7f6efe04a21b223f05dfe4d1dfb159999e2f8d101e44cc4"}, + {file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f91bfc39f7a64168e08ab831fa497ec5438c1d6c6e2f9e12848d95ad11ac8523"}, + {file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:effcae2152afe7937a28376dbabb25c770ef99ed4e16a4ffeb8e6a4f7c4f06aa"}, + {file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2177e59c033bf0d1bf7de1ced561205963583caf3242c6c700a723034bfb5f8e"}, + {file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66f4f48a89cdd30ab3a47335df81c76e9a63799d0d84b29c0618371c66fa37b0"}, + {file = "rpds_py-0.22.1-cp310-cp310-win32.whl", hash = "sha256:b07fa9e634234e84096adfa4be3828c8f26e238679c122824b2b3d7131bec578"}, + {file = "rpds_py-0.22.1-cp310-cp310-win_amd64.whl", hash = "sha256:ca4657e9fd0b1b5376942d403d634ce188f79064f0873aa853ab05b10185ceec"}, + {file = "rpds_py-0.22.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:608c84699b2db09c6a8743845b1a3dad36fae53eaaecb241d45b13dff74405fb"}, + {file = "rpds_py-0.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dae4eb9b5534e09ba6c6ab496a757e5e394b7e7b08767d25ca37e8d36491114"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:09a1f000c5f6e08b298275bae00921e9fbbf2a35dae0a86db2821c058c2201a9"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:580ccbf11f02f948add4cb641843030a89f1463d7c0740cbfc9aca91e9dc34b3"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96559e05bdf938b2048353e10a7920b98f853cefe4482c2064a718d7d0a50bd7"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:128cbaed7ba26116820bcb992405d6a13ea18c8fca1b8c4f59906d858e91e979"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:734783dd7da58f76222f458346ddebdb3621686a1a2a667db5049caf0c9956b9"}, + {file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c9ce6b83597d45bec44a2690857ede62fc98223772135f8a7fa90884eb726501"}, + {file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bca4428c4a957b78ded3e6e62884ab03f029dce8fa8d34818da0f80f61332b49"}, + {file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1ded65691a1d3fd7d2aa89d2c91aa51f941601bb2ce099739909034d957fef4b"}, + {file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72407065ad459db9f3d052ea8c51e02534f02533fc61e51cbab3bd94166f086c"}, + {file = "rpds_py-0.22.1-cp311-cp311-win32.whl", hash = "sha256:eb013aa01b404219f28dc973d9e6310fd4db216d7299253dd355629952e0564e"}, + {file = "rpds_py-0.22.1-cp311-cp311-win_amd64.whl", hash = "sha256:8bd9ec1db79a664f4cbb12878693b73416f4d2cb425d3e27eccc1bdfbdc826ef"}, + {file = "rpds_py-0.22.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8ec41049c90d204a6561238a9ad6c7263ebb7009d9759c98b58078d9d2fec9ba"}, + {file = "rpds_py-0.22.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:102be79c4cc47a4aeb5912401185c404cd2601c15a7163bbecff7f1bfe20b669"}, + {file = 
"rpds_py-0.22.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a603155db408f773637f9e3a712c6e3cbc521aaa8fa2b99f9ba6106c59a2496"}, + {file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5dbff9402c2bdf00bf0df9905694b3c292a3847c725651938a72f554351a5fcb"}, + {file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96b3759d8ab2323324e0a92b2f44834f9d88089b8d1ab6f533b61f4be3411cef"}, + {file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3029f481b31f329b1fdb4ec4b56935d82210ddd9c6f86ea5a87c06f1e97b161"}, + {file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d280b4bf09f719b89fd9aab3b71067acc0d0449b7d1eba99a2ade4939cef8296"}, + {file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8e97e19aa7b0b0d801a159f932ce4435f1049c8c38e2bb372bb5bee559ce50"}, + {file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:50e4b5d291105f7063259fe0125b1af902fb34499444d7c5c521dd8328b00939"}, + {file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d3777c446bb1c5fcd82dc3f8776e1a146cd91e80cc1892f8634575ace438d22f"}, + {file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:447ae1104fb32197b9262f772d565d38e834cc2e9edd89350b37b88fed636e70"}, + {file = "rpds_py-0.22.1-cp312-cp312-win32.whl", hash = "sha256:55d371b9d8b0c2a68a50413a8cb01c3c3ce1ea4f768bf77b66669a9a486e101e"}, + {file = "rpds_py-0.22.1-cp312-cp312-win_amd64.whl", hash = "sha256:413a30a99d8683dace3765885920ed27ab662efbb6c98d81db76c397ad1ffd71"}, + {file = "rpds_py-0.22.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa2ba0176037c915d8660a4e46581d645e2c22b5373e466bc8640a794d45861a"}, + {file = "rpds_py-0.22.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:4ba6c66fbc6015b2f99e7176fec41793cecb00c4cc357cad038dff85e6ac42ab"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15fa4ca658f8ad22645d3531682b17e5580832efbfa87304c3e62214c79c1e8a"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7833ef6f5d6cb634f296abfd93452fb3eb44c4e9a6ae95c1021eab704c1cee2"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0467838c90435b80793cde486a318fc916ee57f2af54e4b10c72b20cbdcbaa9"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d962e2e89b3a95e3597a34b8c93ced1e98958502c5b8096c9fd69deff279f561"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ce729f1dc8a4a190c34b69f75377bddc004079b2963ab722ab91fafe040be6d"}, + {file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8080467df22feca0fc9c46567001777c6fbc2b4a2683a7137420896051874ca1"}, + {file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0f9eb37d3a60b262a98ab51ee899cac039de9ca0ce68dcf1a6518a09719020b0"}, + {file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:153248f48d6f90a295a502f53ec544a3ffbd21b0bb32f5dca39c4b93a764d6a2"}, + {file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0a53592cdf98cec3dfcdb24ffec8a4797e7656b65700099af43ec7df023b6de4"}, + {file = "rpds_py-0.22.1-cp313-cp313-win32.whl", hash = "sha256:e8056adcefa2dcb67e8bc91ea5eee26df66e8b297a8cd6ff0903f85c70908fa0"}, + {file = "rpds_py-0.22.1-cp313-cp313-win_amd64.whl", hash = "sha256:a451dba533be77454ebcffc85189108fc05f279100835ac76e7989edacb89156"}, + {file = "rpds_py-0.22.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:2ea23f1525d4f64286dbe0947c929d45c3ffe963b2dbed1d3844a2e4938bda42"}, + {file = 
"rpds_py-0.22.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3aaa22487477de9618ce3b37f99fbe81219ba96f3c2ca84f576f0ab451b83aba"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8954b9ffe60f479a0c0ba40987db2546c735ab02a725ea7fd89342152d4d821d"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8502a02ae3ae67084f5a0bf5a8253b19fa7a887f824e41e016cdb0ac532a06f"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a083221b6a4ecdef38a60c95d8d3223d99449cb4da2544e9644958dc16664eb9"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:542eb246d5be31b5e0a9c8ddb9539416f9b31f58f75bd4ee328bff2b5c58d6fd"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffae97d28ea4f2c613a751d087b75a97fb78311b38cc2e9a2f4587e473ace167"}, + {file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0ff8d5b13ce2357fa8b33a0a2e3775aa71df5bf7c8ba060634c9d15ab12f357"}, + {file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0f057a0c546c42964836b209d8de9ea1a4f4b0432006c6343cbe633d8ca14571"}, + {file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:48ee97c7c6027fd423058675b5a39d0b5f7a1648250b671563d5c9f74ff13ff0"}, + {file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:babec324e8654a59122aaa66936a9a483faa03276db9792f51332475c2dddc4a"}, + {file = "rpds_py-0.22.1-cp313-cp313t-win32.whl", hash = "sha256:e69acdbc132c9592c8dc393af85e38e206ca847c7019a953ff625191c3a12312"}, + {file = "rpds_py-0.22.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c783e4ed68200f4e03c125690d23158b1c49c4b186d458a18debc109bbdc3c2e"}, + {file = "rpds_py-0.22.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = 
"sha256:2143c3aed85992604d758bbe67da839fb4aab3dd2e1c6dddab5b3ca7162b34a2"}, + {file = "rpds_py-0.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f57e2d0f8022783426121b586d7c842ea40ea832a29e28ca36c881b54c74fb28"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c0c324879d483504b07f7b18eb1b50567c434263bbe4866ecce33056162668a"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c40e02cc4f3e18fd39344edb10eebe04bd11cfd13119606b5771e5ea51630d3"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f76c6f319e57007ad52e671ec741d801324760a377e3d4992c9bb8200333ebac"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f5cae9b415ea8a6a563566dbf46650222eccc5971c7daa16fbee63aef92ae543"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b09209cdfcacf5eba9cf80367130532e6c02e695252e1f64d3cfcc2356e6e19f"}, + {file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dbe428d0ac6eacaf05402adbaf137f59ad6063848182d1ff294f95ce0f24005b"}, + {file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:626b9feb01bff049a5aec4804f0c58db12585778b4902e5376a95b01f80a7a16"}, + {file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec1ccc2a9f764cd632fb8ab28fdde166250df54fc8d97315a4a6948dc5367639"}, + {file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ef92b1fbe6aa2e7885eb90853cc016b1fc95439a8cc8da6d526880e9e2148695"}, + {file = "rpds_py-0.22.1-cp39-cp39-win32.whl", hash = "sha256:c88535f83f7391cf3a45af990237e3939a6fdfbedaed2571633bfdd0bceb36b0"}, + {file = "rpds_py-0.22.1-cp39-cp39-win_amd64.whl", hash = "sha256:7839b7528faa4d134c183b1f2dd1ee4dc2ca2f899f4f0cfdf00fc04c255262a7"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash 
= "sha256:a0ed14a4162c2c2b21a162c9fcf90057e3e7da18cd171ab344c1e1664f75090e"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fdeae9010533e47715c37df83264df0122584e40d691d50cf3607c060952a3"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4659b2e4a5008715099e216050f5c6976e5a4329482664411789968b82e3f17d"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a18aedc032d6468b73ebbe4437129cb30d54fe543cde2f23671ecad76c3aea24"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149b4d875ef9b12a8f5e303e86a32a58f8ef627e57ec97a7d0e4be819069d141"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fdaee3947eaaa52dae3ceb9d9f66329e13d8bae35682b1e5dd54612938693934"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36ce951800ed2acc6772fd9f42150f29d567f0423989748052fdb39d9e2b5795"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab784621d3e2a41916e21f13a483602cc989fd45fff637634b9231ba43d4383b"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:c2a214bf5b79bd39a9de1c991353aaaacafda83ba1374178309e92be8e67d411"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:85060e96953647871957d41707adb8d7bff4e977042fd0deb4fc1881b98dd2fe"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:c6f3fd617db422c9d4e12cb8d84c984fe07d6d9cb0950cbf117f3bccc6268d05"}, + {file = "rpds_py-0.22.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f2d1b58a0c3a73f0361759642e80260a6d28eee6501b40fe25b82af33ef83f21"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:76eaa4c087a061a2c8a0a92536405069878a8f530c00e84a9eaf332e70f5561f"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:959ae04ed30cde606f3a0320f0a1f4167a107e685ef5209cce28c5080590bd31"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:198067aa6f3d942ff5d0d655bb1e91b59ae85279d47590682cba2834ac1b97d2"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3e7e99e2af59c56c59b6c964d612511b8203480d39d1ef83edc56f2cb42a3f5d"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0545928bdf53dfdfcab284468212efefb8a6608ca3b6910c7fb2e5ed8bdc2dc0"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef7282d8a14b60dd515e47060638687710b1d518f4b5e961caad43fb3a3606f9"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe3f245c2f39a5692d9123c174bc48f6f9fe3e96407e67c6d04541a767d99e72"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efb2ad60ca8637d5f9f653f9a9a8d73964059972b6b95036be77e028bffc68a3"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:d8306f27418361b788e3fca9f47dec125457f80122e7e31ba7ff5cdba98343f8"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:4c8dc7331e8cbb1c0ea2bcb550adb1777365944ffd125c69aa1117fdef4887f5"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:776a06cb5720556a549829896a49acebb5bdd96c7bba100191a994053546975a"}, + {file = "rpds_py-0.22.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e4f91d702b9ce1388660b3d4a28aa552614a1399e93f718ed0dacd68f23b3d32"}, + {file = "rpds_py-0.22.1.tar.gz", hash = "sha256:157a023bded0618a1eea54979fe2e0f9309e9ddc818ef4b8fc3b884ff38fedd5"}, ] 
[[package]] @@ -7647,4 +7644,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b" +content-hash = "0d9d498f50601c95a8616797441f00597acdea1e6a70d3b9642c17ffacc1bb45" diff --git a/pyproject.toml b/pyproject.toml index e41e4236..bd6ec1c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,10 @@ packages = [{include = "docling"}] ###################### python = "^3.9" pydantic = ">=2.0.0,<2.10" -docling-core = "^2.6.1" -docling-ibm-models = "^2.0.6" -deepsearch-glm = "^0.26.1" +docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "feat-add-legacy-convert" } +docling-ibm-models = { git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", branch = "nli/performance" } +deepsearch-glm = { git = "ssh://git@github.com/DS4SD/deepsearch-glm.git", branch = "cau/layout-processing-children-payloads" } +docling-parse = { git = "ssh://git@github.com/DS4SD/docling-parse.git", branch = "dev/expose-cell-sanitisation-via-python" } filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" @@ -36,7 +37,6 @@ huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" tesserocr = { version = "^2.7.1", optional = true } -docling-parse = "^2.0.5" certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = "^1.6.0"