Add migration instructions to doc (wip)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 12:34:22 +00:00 · 2024-10-15 17:08:48 +02:00 · 2024-10-15 17:08:48 +02:00 · 74e0452b6a
commit 74e0452b6a
parent 9d15f4d5bf
13 changed files with 116 additions and 30 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -11,7 +11,6 @@ import typer
 from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -23,7 +23,6 @@ from docling_core.types.doc import (
    TextItem,
 )
 from docling_core.types.doc.document import ListItem
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
 from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
@ -43,8 +42,6 @@ from docling.datamodel.base_models import (
    MimeTypeToFormat,
    Page,
    PageElement,
    Table,
    TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.utils import create_file_hash, create_hash
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,9 +1,8 @@
 import warnings
 from enum import Enum, auto
 from pathlib import Path
-from typing import Annotated, List, Literal, Optional, Union
+from typing import List, Literal, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field
 class TableFormerMode(str, Enum):
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -1,7 +1,7 @@
 import copy
 import logging
 from abc import abstractmethod
-from typing import Iterable, List, Tuple
+from typing import Iterable, List
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -1,16 +1,12 @@
 import copy
 import random
-from typing import List, Tuple, Union
+from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import (
+from deepsearch_glm.utils.doc_utils import to_docling_document
    to_docling_document,
    to_legacy_document_format,
 )
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import Document as DsLegacyDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -1,7 +1,6 @@
 import logging
 from typing import Iterable
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@ -1,9 +1,9 @@
 import json
 import logging
 import time
 from pathlib import Path
 from typing import Iterable
 import time
 import yaml
 from docling.datamodel.base_models import ConversionStatus
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@ -1,18 +1,14 @@
 import json
 import logging
 import time
 from pathlib import Path
 from typing import Iterable
-from docling.datamodel.base_models import ConversionStatus, InputFormat
+import time
-from docling.datamodel.document import ConversionResult
+
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 _log = logging.getLogger(__name__)
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@ -1,7 +1,8 @@
 import logging
 import time
 from pathlib import Path
 import time
 from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@ -1,9 +1,9 @@
 import datetime
 import logging
 import time
 from pathlib import Path
 import pandas as pd
 import time
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@ -1,8 +1,8 @@
 import logging
 import time
 from pathlib import Path
 import pandas as pd
 import time
 from docling.document_converter import DocumentConverter
--- a/docs/v2.md
+++ b/docs/v2.md
@ -1,7 +1,107 @@
 ## What's new
-Stay tuned!
+Docling v2 introduces several new features:
 - Understands and converts PDF, MS Word, MS PowerPoint, HTML and several image formats
 - Produces a new, universal document representation which can encapsulate document hierarchy
 - Comes with a fresh new API and CLI
 ## Migration from v1
-Stay tuned!
+### Setting up a `DocumentConverter`
 To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
 You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options 
 per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults 
 will be used for all `allowed_formats`.
 Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend.
 They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
 ```python
 from docling.document_converter import DocumentConverter
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
 )
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 ## Default initialization still works as before:
 # doc_converter = DocumentConverter() 
 ## Custom options are now defined per format. 
 doc_converter = (
    DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
            ),
        },
    )
 )
 ```
 Note: If you work only with defaults, everything remains the same as in Docling v1.
 ### Converting documents
 We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for 
 better semantics. You can now call the conversion directly with a single file, or a list of input files, 
 or `DocumentStream` objects, without constructing a `DocumentConversionInput` object first.
 * `DocumentConverter.convert` now converts a single file input (previously `DocumentConverter.convert_single`).
 * `DocumentConverter.convert_all` now converts many files at once (previously `DocumentConverter.convert`).
 ```python
 ...
 ## Convert a single file (from URL or local path)
 conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
 ## Convert several files at once:
 input_files = [
    "tests/data/wiki_duck.html",
    "tests/data/word_sample.docx",
    "tests/data/lorem_ipsum.docx",
    "tests/data/powerpoint_sample.pptx",
    "tests/data/2305.03393v1-pg9-img.png",
    "tests/data/2206.01062.pdf",
 ]
 conv_results_iter = doc_converter.convert_all(input_files) # previously `convert`
 ```
 Through the `raises_on_error` argument, you can also control whether the conversion raises an exception as soon as 
 it encounters a problem, or resiliently converts all files first and reflects errors in each file's conversion status.
 By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed).
 ```python
 ...
 conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert`
 ```
 ### Exporting documents into JSON, Markdown, Doctags
 We have simplified how you can access and export the converted document data, too.
 TBD.
 ### CLI
 TBD.
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@ -3,7 +3,6 @@ from pathlib import Path
 import pytest
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption