From 74e0452b6add931667f6034ba98a23e2804ac000 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 15 Oct 2024 17:08:48 +0200 Subject: [PATCH] Add migration instructions to doc (wip) Signed-off-by: Christoph Auer --- docling/cli/main.py | 1 - docling/datamodel/document.py | 3 - docling/datamodel/pipeline_options.py | 5 +- docling/models/base_ocr_model.py | 2 +- docling/models/ds_glm_model.py | 8 +- docling/models/tesseract_ocr_model.py | 1 - docs/examples/batch_convert.py | 2 +- docs/examples/custom_convert.py | 12 +-- docs/examples/export_figures.py | 3 +- docs/examples/export_multimodal.py | 2 +- docs/examples/export_tables.py | 2 +- docs/v2.md | 104 +++++++++++++++++++++++++- tests/test_legacy_format_transform.py | 1 - 13 files changed, 116 insertions(+), 30 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index f97e4938..1800ea18 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -11,7 +11,6 @@ import typer from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, FormatToExtensions, diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 41d62114..1b39ff7d 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -23,7 +23,6 @@ from docling_core.types.doc import ( TextItem, ) from docling_core.types.doc.document import ListItem -from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell from docling_core.utils.file import resolve_file_source from pydantic import BaseModel @@ -43,8 +42,6 @@ from docling.datamodel.base_models import ( MimeTypeToFormat, Page, PageElement, - Table, - TextElement, ) from docling.datamodel.settings import DocumentLimits from docling.utils.utils import 
create_file_hash, create_hash diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 473ef980..ccb52c4a 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,9 +1,8 @@ -import warnings from enum import Enum, auto from pathlib import Path -from typing import Annotated, List, Literal, Optional, Union +from typing import List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field class TableFormerMode(str, Enum): diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index c1ca8ef7..59ae2295 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -1,7 +1,7 @@ import copy import logging from abc import abstractmethod -from typing import Iterable, List, Tuple +from typing import Iterable, List import numpy as np from docling_core.types.doc import BoundingBox, CoordOrigin diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 635651ad..1b0a84e7 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -1,16 +1,12 @@ import copy import random -from typing import List, Tuple, Union +from typing import List, Union from deepsearch_glm.nlp_utils import init_nlp_model -from deepsearch_glm.utils.doc_utils import ( - to_docling_document, - to_legacy_document_format, -) +from deepsearch_glm.utils.doc_utils import to_docling_document from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models from docling_core.types import BaseText from docling_core.types import Document as DsDocument -from docling_core.types import Document as DsLegacyDocument from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import PageDimensions, PageReference, Prov, Ref diff --git 
a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index ea74b6ad..a97eb9a8 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,7 +1,6 @@ import logging from typing import Iterable -import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index da1c701f..c8e244fa 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -1,9 +1,9 @@ import json import logging -import time from pathlib import Path from typing import Iterable +import time import yaml from docling.datamodel.base_models import ConversionStatus diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 8c753396..941cc951 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -1,18 +1,14 @@ import json import logging -import time from pathlib import Path -from typing import Iterable -from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import ConversionResult +import time + +from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( PdfPipelineOptions, - TesseractCliOcrOptions, - TesseractOcrOptions, ) -from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docling.document_converter import DocumentConverter, PdfFormatOption _log = logging.getLogger(__name__) diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py index 1110b0f1..90f465b4 100644 --- a/docs/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -1,7 +1,8 @@ import logging -import time from pathlib import Path +import time + from docling.datamodel.base_models import FigureElement, InputFormat, 
Table from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index 72845f41..5ead9a0a 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -1,9 +1,9 @@ import datetime import logging -import time from pathlib import Path import pandas as pd +import time from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 68b9ce47..55d3a40f 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -1,8 +1,8 @@ import logging -import time from pathlib import Path import pandas as pd +import time from docling.document_converter import DocumentConverter diff --git a/docs/v2.md b/docs/v2.md index 55135b1a..7e5e9ad9 100644 --- a/docs/v2.md +++ b/docs/v2.md @@ -1,7 +1,107 @@ ## What's new -Stay tuned! +Docling v2 introduces several new features: +- Understands and converts PDF, MS Word, MS PowerPoint, HTML and several image formats +- Produces a new, universal document representation which can encapsulate document hierarchy +- Comes with a fresh new API and CLI ## Migration from v1 -Stay tuned! +### Setting up a `DocumentConverter` + +To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object. +You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options +per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults +will be used for all `allowed_formats`. + +Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend. 
+They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below. + +```python +from docling.document_converter import DocumentConverter +from docling.datamodel.base_models import InputFormat +from docling.document_converter import ( + DocumentConverter, + PdfFormatOption, + WordFormatOption, +) +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend + +## Default initialization still works as before: +# doc_converter = DocumentConverter() + +## Custom options are now defined per format. +doc_converter = ( + DocumentConverter( # all of the below is optional, has internal defaults. + allowed_formats=[ + InputFormat.PDF, + InputFormat.IMAGE, + InputFormat.DOCX, + InputFormat.HTML, + InputFormat.PPTX, + ], # whitelist formats, non-matching files are ignored. + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend + ), + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend + ), + }, + ) +) +``` + +Note: If you work only with defaults, all remains the same as in Docling v1. + +### Converting documents + +We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for +better semantics. You can now call the conversion directly with a single file, or a list of input files, +or `DocumentStream` objects, without constructing a `DocumentConversionInput` object first. + +* `DocumentConverter.convert` now converts a single file input (previously `DocumentConverter.convert_single`). +* `DocumentConverter.convert_all` now converts many files at once (previously `DocumentConverter.convert`). + + +```python +... 
+## Convert a single file (from URL or local path) +conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` + +## Convert several files at once: + +input_files = [ + "tests/data/wiki_duck.html", + "tests/data/word_sample.docx", + "tests/data/lorem_ipsum.docx", + "tests/data/powerpoint_sample.pptx", + "tests/data/2305.03393v1-pg9-img.png", + "tests/data/2206.01062.pdf", +] + +conv_results_iter = doc_converter.convert_all(input_files) # previously `convert` + +``` +Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first +encountering a problem, or resiliently convert all files first and reflect errors in each file's conversion status. +By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed). + +```python +... +conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert` + +``` + +### Exporting documents into JSON, Markdown, Doctags + +We have simplified how you can access and export the converted document data, too. + +TBD. + + +### CLI + +TBD. diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index 517b0b8b..4f764542 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -3,7 +3,6 @@ from pathlib import Path import pytest -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption