diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py deleted file mode 100644 index 6f7de07a..00000000 --- a/docling/models/ds_glm_model.py +++ /dev/null @@ -1,328 +0,0 @@ -import copy -import random -from pathlib import Path -from typing import List, Union - -from deepsearch_glm.andromeda_nlp import nlp_model -from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument -from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox -from docling_core.types.legacy_doc.base import ( - Figure, - PageDimensions, - PageReference, - Prov, - Ref, -) -from docling_core.types.legacy_doc.base import Table as DsSchemaTable -from docling_core.types.legacy_doc.base import TableCell -from docling_core.types.legacy_doc.document import BaseText -from docling_core.types.legacy_doc.document import ( - CCSDocumentDescription as DsDocumentDescription, -) -from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject -from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument -from PIL import ImageDraw -from pydantic import BaseModel, ConfigDict, TypeAdapter - -from docling.datamodel.base_models import ( - Cluster, - ContainerElement, - FigureElement, - Table, - TextElement, -) -from docling.datamodel.document import ConversionResult, layout_label_to_ds_type -from docling.datamodel.settings import settings -from docling.utils.glm_utils import to_docling_document -from docling.utils.profiling import ProfilingScope, TimeRecorder -from docling.utils.utils import create_hash - - -class GlmOptions(BaseModel): - model_config = ConfigDict(protected_namespaces=()) - - model_names: str = "" # e.g. 
"language;term;reference" - - -class GlmModel: - def __init__(self, options: GlmOptions): - self.options = options - - self.model = nlp_model(loglevel="error", text_ordering=True) - - def _to_legacy_document(self, conv_res) -> DsDocument: - title = "" - desc: DsDocumentDescription = DsDocumentDescription(logs=[]) - - page_hashes = [ - PageReference( - hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)), - page=p.page_no + 1, - model="default", - ) - for p in conv_res.pages - ] - - file_info = DsFileInfoObject( - filename=conv_res.input.file.name, - document_hash=conv_res.input.document_hash, - num_pages=conv_res.input.page_count, - page_hashes=page_hashes, - ) - - main_text: List[Union[Ref, BaseText]] = [] - tables: List[DsSchemaTable] = [] - figures: List[Figure] = [] - - page_no_to_page = {p.page_no: p for p in conv_res.pages} - - for element in conv_res.assembled.elements: - # Convert bboxes to lower-left origin. - target_bbox = DsBoundingBox( - element.cluster.bbox.to_bottom_left_origin( - page_no_to_page[element.page_no].size.height - ).as_tuple() - ) - - if isinstance(element, TextElement): - main_text.append( - BaseText( - text=element.text, - obj_type=layout_label_to_ds_type.get(element.label), - name=element.label, - prov=[ - Prov( - bbox=target_bbox, - page=element.page_no + 1, - span=[0, len(element.text)], - ) - ], - ) - ) - elif isinstance(element, Table): - index = len(tables) - ref_str = f"#/tables/{index}" - main_text.append( - Ref( - name=element.label, - obj_type=layout_label_to_ds_type.get(element.label), - ref=ref_str, - ), - ) - - # Initialise empty table data grid (only empty cells) - table_data = [ - [ - TableCell( - text="", - # bbox=[0,0,0,0], - spans=[[i, j]], - obj_type="body", - ) - for j in range(element.num_cols) - ] - for i in range(element.num_rows) - ] - - # Overwrite cells in table data for which there is actual cell content. 
- for cell in element.table_cells: - for i in range( - min(cell.start_row_offset_idx, element.num_rows), - min(cell.end_row_offset_idx, element.num_rows), - ): - for j in range( - min(cell.start_col_offset_idx, element.num_cols), - min(cell.end_col_offset_idx, element.num_cols), - ): - celltype = "body" - if cell.column_header: - celltype = "col_header" - elif cell.row_header: - celltype = "row_header" - elif cell.row_section: - celltype = "row_section" - - def make_spans(cell): - for rspan in range( - min(cell.start_row_offset_idx, element.num_rows), - min(cell.end_row_offset_idx, element.num_rows), - ): - for cspan in range( - min( - cell.start_col_offset_idx, element.num_cols - ), - min(cell.end_col_offset_idx, element.num_cols), - ): - yield [rspan, cspan] - - spans = list(make_spans(cell)) - if cell.bbox is not None: - bbox = cell.bbox.to_bottom_left_origin( - page_no_to_page[element.page_no].size.height - ).as_tuple() - else: - bbox = None - - table_data[i][j] = TableCell( - text=cell.text, - bbox=bbox, - # col=j, - # row=i, - spans=spans, - obj_type=celltype, - # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx], - # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx] - ) - - tables.append( - DsSchemaTable( - num_cols=element.num_cols, - num_rows=element.num_rows, - obj_type=layout_label_to_ds_type.get(element.label), - data=table_data, - prov=[ - Prov( - bbox=target_bbox, - page=element.page_no + 1, - span=[0, 0], - ) - ], - ) - ) - - elif isinstance(element, FigureElement): - index = len(figures) - ref_str = f"#/figures/{index}" - main_text.append( - Ref( - name=element.label, - obj_type=layout_label_to_ds_type.get(element.label), - ref=ref_str, - ), - ) - figures.append( - Figure( - prov=[ - Prov( - bbox=target_bbox, - page=element.page_no + 1, - span=[0, 0], - ) - ], - obj_type=layout_label_to_ds_type.get(element.label), - payload={ - "children": TypeAdapter(List[Cluster]).dump_python( - element.cluster.children - ) - }, # hack to 
channel child clusters through GLM - ) - ) - elif isinstance(element, ContainerElement): - main_text.append( - BaseText( - text="", - payload={ - "children": TypeAdapter(List[Cluster]).dump_python( - element.cluster.children - ) - }, # hack to channel child clusters through GLM - obj_type=layout_label_to_ds_type.get(element.label), - name=element.label, - prov=[ - Prov( - bbox=target_bbox, - page=element.page_no + 1, - span=[0, 0], - ) - ], - ) - ) - - page_dimensions = [ - PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width) - for p in conv_res.pages - if p.size is not None - ] - - ds_doc: DsDocument = DsDocument( - name=title, - description=desc, - file_info=file_info, - main_text=main_text, - tables=tables, - figures=figures, - page_dimensions=page_dimensions, - ) - - return ds_doc - - def __call__(self, conv_res: ConversionResult) -> DoclingDocument: - with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT): - ds_doc = self._to_legacy_document(conv_res) - ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True) - - glm_doc = self.model.apply_on_doc(ds_doc_dict) - - docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental - - # DEBUG code: - def draw_clusters_and_cells(ds_document, page_no, show: bool = False): - clusters_to_draw = [] - image = copy.deepcopy(conv_res.pages[page_no].image) - for ix, elem in enumerate(ds_document.main_text): - if isinstance(elem, BaseText): - prov = elem.prov[0] # type: ignore - elif isinstance(elem, Ref): - _, arr, index = elem.ref.split("/") - index = int(index) # type: ignore - if arr == "tables": - prov = ds_document.tables[index].prov[0] - elif arr == "figures": - prov = ds_document.pictures[index].prov[0] - else: - prov = None - - if prov and prov.page == page_no: - clusters_to_draw.append( - Cluster( - id=ix, - label=elem.name, - bbox=BoundingBox.from_tuple( - coord=prov.bbox, # type: ignore - origin=CoordOrigin.BOTTOMLEFT, - 
).to_top_left_origin(conv_res.pages[page_no].size.height), - ) - ) - - draw = ImageDraw.Draw(image) - for c in clusters_to_draw: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="red") - draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255)) - - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), - ) - for tc in c.cells: # [:1]: - x0, y0, x1, y1 = tc.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - - if show: - image.show() - else: - out_path: Path = ( - Path(settings.debug.debug_output_path) - / f"debug_{conv_res.input.file.stem}" - ) - out_path.mkdir(parents=True, exist_ok=True) - - out_file = out_path / f"doc_page_{page_no:05}.png" - image.save(str(out_file), format="png") - - # for item in ds_doc.page_dimensions: - # page_no = item.page - # draw_clusters_and_cells(ds_doc, page_no) - - return docling_doc diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py new file mode 100644 index 00000000..1c2325d4 --- /dev/null +++ b/docling/models/readingorder_model.py @@ -0,0 +1,290 @@ +import copy +import random +from pathlib import Path +from typing import Dict, List + +from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, + DocItemLabel, + DoclingDocument, + DocumentOrigin, + GroupLabel, + ProvenanceItem, + RefItem, + TableData, +) +from docling_core.types.legacy_doc.base import Ref +from docling_core.types.legacy_doc.document import BaseText +from docling_ibm_models.reading_order.reading_order_rb import ( + PageElement as ReadingOrderPageElement, +) +from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor +from PIL import ImageDraw +from pydantic import BaseModel, ConfigDict + +from docling.datamodel.base_models import ( + Cluster, + ContainerElement, + FigureElement, + Table, + TextElement, +) +from docling.datamodel.document import ConversionResult +from 
docling.datamodel.settings import settings +from docling.utils.profiling import ProfilingScope, TimeRecorder + + +class ReadingOrderOptions(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + + model_names: str = "" # e.g. "language;term;reference" + + +class ReadingOrderModel: + def __init__(self, options: ReadingOrderOptions): + self.options = options + self.ro_model = ReadingOrderPredictor() + + def _assembled_to_readingorder_elements( + self, conv_res: ConversionResult + ) -> List[ReadingOrderPageElement]: + + elements: List[ReadingOrderPageElement] = [] + + for ( + element + ) in ( + conv_res.assembled.body + ): # FIXME: use conv_res.assembled.elements (include furniture) + + page_height = conv_res.pages[element.page_no].size.height # type: ignore + bbox = element.cluster.bbox.to_bottom_left_origin(page_height) + text = element.text or "" + + elements.append( + ReadingOrderPageElement( + cid=len(elements), + ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"), + text=text, + page_no=element.page_no, + page_size=conv_res.pages[element.page_no].size, + label=element.label, + l=bbox.l, + r=bbox.r, + b=bbox.b, + t=bbox.t, + coord_origin=bbox.coord_origin, + ) + ) + + return elements + + def _readingorder_elements_to_docling_doc( + self, + conv_res: ConversionResult, + ro_elements: List[ReadingOrderPageElement], + el_to_captions_mapping: Dict[int, List[int]], + el_to_footnotes_mapping: Dict[int, List[int]], + el_merges_mapping: Dict[int, List[int]], + ) -> DoclingDocument: + + id_to_elem = { + RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem + for elem in conv_res.assembled.elements + } + + origin = DocumentOrigin( + mimetype="application/pdf", + filename=conv_res.input.file.name, + binary_hash=conv_res.input.document_hash, + ) + doc_name = Path(origin.filename).stem + out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin) + + for page in conv_res.pages: + page_no = page.page_no + 1 + size = page.size + + 
assert size is not None + + out_doc.add_page(page_no=page_no, size=size) + + current_list = None + + # TODO: handle merges + + for rel in ro_elements: + element = id_to_elem[rel.ref.cref] + + page_height = conv_res.pages[element.page_no].size.height # type: ignore + + if isinstance(element, TextElement): + text = element.text + + prov = ProvenanceItem( + page_no=element.page_no + 1, + charspan=(0, len(text)), + bbox=element.cluster.bbox.to_bottom_left_origin(page_height), + ) + label = element.label + + if label == DocItemLabel.LIST_ITEM: + if current_list is None: + current_list = out_doc.add_group( + label=GroupLabel.LIST, name="list" + ) + + # TODO: Infer if this is a numbered or a bullet list item + out_doc.add_list_item( + text=text, enumerated=False, prov=prov, parent=current_list + ) + elif label == DocItemLabel.SECTION_HEADER: + current_list = None + + out_doc.add_heading(text=text, prov=prov) + elif label == DocItemLabel.CODE: + current_list = None + + out_doc.add_code(text=text, prov=prov) + elif label == DocItemLabel.FORMULA: + current_list = None + + out_doc.add_text( + label=DocItemLabel.FORMULA, text="", orig=text, prov=prov + ) + else: + current_list = None + + out_doc.add_text(label=element.label, text=text, prov=prov) + + elif isinstance(element, Table): + + tbl_data = TableData( + num_rows=element.num_rows, + num_cols=element.num_cols, + table_cells=element.table_cells, + ) + + prov = ProvenanceItem( + page_no=element.page_no + 1, + charspan=(0, 0), + bbox=element.cluster.bbox.to_bottom_left_origin(page_height), + ) + + tbl = out_doc.add_table( + data=tbl_data, prov=prov, label=element.cluster.label + ) + + # TODO: handle element.cluster.children. 
+ # TODO: handle captions + # tbl.captions.extend(caption_refs) + + elif isinstance(element, FigureElement): + text = "" + prov = ProvenanceItem( + page_no=element.page_no + 1, + charspan=(0, len(text)), + bbox=element.cluster.bbox.to_bottom_left_origin(page_height), + ) + + pic = out_doc.add_picture(prov=prov) + + # TODO: handle element.cluster.children. + # TODO: handle captions + # pic.captions.extend(caption_refs) + # _add_child_elements(pic, doc, obj, pelem) + + elif isinstance(element, ContainerElement): + pass + # TODO: handle element.cluster.children. + + return out_doc + + def __call__(self, conv_res: ConversionResult) -> DoclingDocument: + with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT): + page_elements = self._assembled_to_readingorder_elements(conv_res) + + # Apply reading order + sorted_elements = self.ro_model.predict_reading_order( + page_elements=page_elements + ) + el_to_captions_mapping = self.ro_model.predict_to_captions( + sorted_elements=sorted_elements + ) + el_to_footnotes_mapping = self.ro_model.predict_to_footnotes( + sorted_elements=sorted_elements + ) + el_merges_mapping = self.ro_model.predict_merges( + sorted_elements=sorted_elements + ) + + docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc( + conv_res, + sorted_elements, + el_to_captions_mapping, + el_to_footnotes_mapping, + el_merges_mapping, + ) + + # DEBUG code: + def draw_clusters_and_cells(ds_document, page_no, show: bool = False): + clusters_to_draw = [] + image = copy.deepcopy(conv_res.pages[page_no].image) + for ix, elem in enumerate(ds_document.main_text): + if isinstance(elem, BaseText): + prov = elem.prov[0] # type: ignore + elif isinstance(elem, Ref): + _, arr, index = elem.ref.split("/") + index = int(index) # type: ignore + if arr == "tables": + prov = ds_document.tables[index].prov[0] + elif arr == "figures": + prov = ds_document.pictures[index].prov[0] + else: + prov = None + + if prov and prov.page == page_no: + 
clusters_to_draw.append( + Cluster( + id=ix, + label=elem.name, + bbox=BoundingBox.from_tuple( + coord=prov.bbox, # type: ignore + origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(conv_res.pages[page_no].size.height), + ) + ) + + draw = ImageDraw.Draw(image) + for c in clusters_to_draw: + x0, y0, x1, y1 = c.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255)) + + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + for tc in c.cells: # [:1]: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) + + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = out_path / f"doc_page_{page_no:05}.png" + image.save(str(out_file), format="png") + + # for item in ds_doc.page_dimensions: + # page_no = item.page + # draw_clusters_and_cells(ds_doc, page_no) + + return docling_doc diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 4e66415f..6d268ecb 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -25,7 +25,6 @@ from docling.models.document_picture_classifier import ( DocumentPictureClassifier, DocumentPictureClassifierOptions, ) -from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.ocr_mac_model import OcrMacModel @@ -35,6 +34,7 @@ from docling.models.page_preprocessing_model import ( PagePreprocessingOptions, ) from docling.models.rapid_ocr_model import RapidOcrModel +from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions from docling.models.table_structure_model import 
TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel @@ -63,7 +63,7 @@ class StandardPdfPipeline(PaginatedPipeline): or self.pipeline_options.generate_table_images ) - self.glm_model = GlmModel(options=GlmOptions()) + self.glm_model = ReadingOrderModel(options=ReadingOrderOptions()) if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None: raise RuntimeError( diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index cd5e4b3b..c2e04d10 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -125,7 +125,7 @@ def main(): conv_results = doc_converter.convert_all( input_doc_paths, - raises_on_error=False, # to let conversion run through all and examine results at the end + raises_on_error=True, # NOTE(review): True raises on the first failed document; use False to let conversion run through all and examine results at the end ) success_count, partial_success_count, failure_count = export_documents( conv_results, output_dir=Path("scratch") diff --git a/poetry.lock b/poetry.lock index b261db4b..b2377b96 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -187,8 +187,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, + {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, ] [[package]] @@ -894,33 +894,39 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] [[package]] name = "docling-ibm-models" -version = "3.3.1" +version = "3.3.0" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_ibm_models-3.3.1-py3-none-any.whl", hash = "sha256:be8f6684839c48d4b318e58a558cd7e2af3351b712f9604a69a415a0e238d5e2"}, - {file = "docling_ibm_models-3.3.1.tar.gz", hash = "sha256:f1d64216bbca6507da6f80de1acf450f33bdc7dc81cfd7f532a6cfc545cc092a"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] +docling-core = "^2.16.0" huggingface_hub = ">=0.23,<1" -jsonlines = ">=3.1.0,<4.0.0" +jsonlines = "^3.1.0" numpy = [ {version = ">=1.24.4,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=1.24.4,<2.0.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, ] -opencv-python-headless = ">=4.6.0.66,<5.0.0.0" -Pillow = ">=10.0.0,<11.0.0" +opencv-python-headless = "^4.6.0.66" +Pillow = "^10.0.0" +pydantic = "^2.0.0" safetensors = {version = ">=0.4.3,<1", extras = ["torch"]} -torch = ">=2.2.2,<3.0.0" -torchvision = ">=0,<1" -tqdm = ">=4.64.0,<5.0.0" +torch = "^2.2.2" +torchvision = "^0" +tqdm = "^4.64.0" transformers = [ {version = ">=4.42.0,<5.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=4.42.0,<4.43.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, ] 
+[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/docling-ibm-models.git" +reference = "dev/add-reading-order" +resolved_reference = "1d2dd932b4484dd9ec6e42c80b0174a06af63e08" + [[package]] name = "docling-parse" version = "3.3.0" @@ -2727,13 +2733,13 @@ pygments = ">2.12.0" [[package]] name = "mkdocs-material" -version = "9.6.2" +version = "9.6.3" description = "Documentation that simply works" optional = false python-versions = ">=3.8" files = [ - {file = "mkdocs_material-9.6.2-py3-none-any.whl", hash = "sha256:71d90dbd63b393ad11a4d90151dfe3dcbfcd802c0f29ce80bebd9bbac6abc753"}, - {file = "mkdocs_material-9.6.2.tar.gz", hash = "sha256:a3de1c5d4c745f10afa78b1a02f917b9dce0808fb206adc0f5bb48b58c1ca21f"}, + {file = "mkdocs_material-9.6.3-py3-none-any.whl", hash = "sha256:1125622067e26940806701219303b27c0933e04533560725d97ec26fd16a39cf"}, + {file = "mkdocs_material-9.6.3.tar.gz", hash = "sha256:c87f7d1c39ce6326da5e10e232aed51bae46252e646755900f4b0fc9192fa832"}, ] [package.dependencies] @@ -2834,8 +2840,8 @@ files = [ [package.dependencies] multiprocess = [ - {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, + {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, ] pygments = ">=2.0" pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} @@ -3844,10 +3850,10 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = 
"python_version >= \"3.11\" and python_version < \"3.12\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -3870,10 +3876,10 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -4059,9 +4065,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = 
">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4825,8 +4831,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.2", markers = "python_version < \"3.11\""}, {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, + {version = ">=0.2", markers = "python_version < \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -7062,13 +7068,13 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "transformers" -version = "4.48.2" +version = "4.48.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.9.0" files = [ - {file = "transformers-4.48.2-py3-none-any.whl", hash = "sha256:493bc5b0268b116eff305edf6656367fc89cf570e7a9d5891369e04751db698a"}, - {file = "transformers-4.48.2.tar.gz", hash = "sha256:dcfb73473e61f22fb3366fe2471ed2e42779ecdd49527a1bdf1937574855d516"}, + {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"}, + {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"}, ] [package.dependencies] @@ -7850,4 +7856,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ca0464df452664834ae9bccc59f89240e2f5e8f3b179761de615548c799680e7" +content-hash = "e693a18915e18102575bb9b1179d78faf4fffe211e7d7b3f5bbf177695979ba1" diff --git a/pyproject.toml b/pyproject.toml index e1c30a3c..0e50ea1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ packages = [{include = "docling"}] python = "^3.9" pydantic = "^2.0.0" docling-core = {extras = 
["chunking"], version = "^2.17.2"} -docling-ibm-models = "^3.3.0" +docling-ibm-models = {git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", rev = "dev/add-reading-order"} # NOTE(review): dev-branch pin over SSH; was "^3.3.0" - confirm a released version is restored before release deepsearch-glm = "^1.0.0" docling-parse = "^3.3.0" filetype = "^1.2.0"