From 74e0452b6add931667f6034ba98a23e2804ac000 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 15 Oct 2024 17:08:48 +0200
Subject: [PATCH] Add migration instructions to doc (wip)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/cli/main.py                   |   1 -
 docling/datamodel/document.py         |   3 -
 docling/datamodel/pipeline_options.py |   5 +-
 docling/models/base_ocr_model.py      |   2 +-
 docling/models/ds_glm_model.py        |   8 +-
 docling/models/tesseract_ocr_model.py |   1 -
 docs/examples/batch_convert.py        |   2 +-
 docs/examples/custom_convert.py       |  12 +--
 docs/examples/export_figures.py       |   3 +-
 docs/examples/export_multimodal.py    |   2 +-
 docs/examples/export_tables.py        |   2 +-
 docs/v2.md                            | 104 +++++++++++++++++++++++++-
 tests/test_legacy_format_transform.py |   1 -
 13 files changed, 116 insertions(+), 30 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index f97e4938..1800ea18 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -11,7 +11,6 @@ import typer
 from docling_core.utils.file import resolve_file_source
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 41d62114..1b39ff7d 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -23,7 +23,6 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.document import ListItem
-from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
 from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
@@ -43,8 +42,6 @@ from docling.datamodel.base_models import (
     MimeTypeToFormat,
     Page,
     PageElement,
-    Table,
-    TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.utils import create_file_hash, create_hash
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 473ef980..ccb52c4a 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,9 +1,8 @@
-import warnings
 from enum import Enum, auto
 from pathlib import Path
-from typing import Annotated, List, Literal, Optional, Union
+from typing import List, Literal, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field
 
 
 class TableFormerMode(str, Enum):
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index c1ca8ef7..59ae2295 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -1,7 +1,7 @@
 import copy
 import logging
 from abc import abstractmethod
-from typing import Iterable, List, Tuple
+from typing import Iterable, List
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py
index 635651ad..1b0a84e7 100644
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@@ -1,16 +1,12 @@
 import copy
 import random
-from typing import List, Tuple, Union
+from typing import List, Union
 
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import (
-    to_docling_document,
-    to_legacy_document_format,
-)
+from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
-from docling_core.types import Document as DsLegacyDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index ea74b6ad..a97eb9a8 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Iterable
 
-import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
 from docling.datamodel.base_models import OcrCell, Page
diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
index da1c701f..c8e244fa 100644
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -1,9 +1,9 @@
 import json
 import logging
-import time
 from pathlib import Path
 from typing import Iterable
 
+import time
 import yaml
 
 from docling.datamodel.base_models import ConversionStatus
diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py
index 8c753396..941cc951 100644
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -1,18 +1,14 @@
 import json
 import logging
-import time
 from pathlib import Path
-from typing import Iterable
 
-from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult
+import time
+
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.document_converter import DocumentConverter, PdfFormatOption
 
 _log = logging.getLogger(__name__)
 
diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py
index 1110b0f1..90f465b4 100644
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -1,7 +1,8 @@
 import logging
-import time
 from pathlib import Path
 
+import time
+
 from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py
index 72845f41..5ead9a0a 100644
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -1,9 +1,9 @@
 import datetime
 import logging
-import time
 from pathlib import Path
 
 import pandas as pd
+import time
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py
index 68b9ce47..55d3a40f 100644
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@@ -1,8 +1,8 @@
 import logging
-import time
 from pathlib import Path
 
 import pandas as pd
+import time
 
 from docling.document_converter import DocumentConverter
 
diff --git a/docs/v2.md b/docs/v2.md
index 55135b1a..7e5e9ad9 100644
--- a/docs/v2.md
+++ b/docs/v2.md
@@ -1,7 +1,107 @@
 ## What's new
 
-Stay tuned!
+Docling v2 introduces several new features:
+- Understands and converts PDF, MS Word, MS PowerPoint, HTML and several image formats
+- Produces a new, universal document representation which can encapsulate document hierarchy
+- Comes with a fresh new API and CLI
 
 ## Migration from v1
 
-Stay tuned!
+### Setting up a `DocumentConverter`
+
+To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
+You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options 
+per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults 
+will be used for all `allowed_formats`.
+
+Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend.
+They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
+
+```python
+from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import (
+    DocumentConverter,
+    PdfFormatOption,
+    WordFormatOption,
+)
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+
+## Default initialization still works as before:
+# doc_converter = DocumentConverter() 
+
+## Custom options are now defined per format. 
+doc_converter = (
+    DocumentConverter(  # all of the below is optional, has internal defaults.
+        allowed_formats=[
+            InputFormat.PDF,
+            InputFormat.IMAGE,
+            InputFormat.DOCX,
+            InputFormat.HTML,
+            InputFormat.PPTX,
+        ],  # whitelist formats, non-matching files are ignored.
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
+            ),
+            InputFormat.DOCX: WordFormatOption(
+                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
+            ),
+        },
+    )
+)
+```
+
+Note: If you work only with defaults, everything remains the same as in Docling v1.
+
+### Converting documents
+
+We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for 
+better semantics. You can now call the conversion directly with a single file, or a list of input files, 
+or `DocumentStream` objects, without constructing a `DocumentConversionInput` object first.
+
+* `DocumentConverter.convert` now converts a single file input (previously `DocumentConverter.convert_single`).
+* `DocumentConverter.convert_all` now converts many files at once (previously `DocumentConverter.convert`).
+
+
+```python
+...
+## Convert a single file (from URL or local path)
+conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
+
+## Convert several files at once:
+
+input_files = [
+    "tests/data/wiki_duck.html",
+    "tests/data/word_sample.docx",
+    "tests/data/lorem_ipsum.docx",
+    "tests/data/powerpoint_sample.pptx",
+    "tests/data/2305.03393v1-pg9-img.png",
+    "tests/data/2206.01062.pdf",
+]
+
+conv_results_iter = doc_converter.convert_all(input_files) # previously `convert`
+
+```
+Through the `raises_on_error` argument, you can also control whether the conversion raises an exception as soon as it
+encounters a problem, or resiliently converts all files first and reflects errors in each file's conversion status.
+By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed).
+
+```python
+...
+conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert`
+
+```
+
+### Exporting documents into JSON, Markdown, Doctags
+
+We have simplified how you can access and export the converted document data, too.
+
+TBD.
+
+
+### CLI
+
+TBD.
diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py
index 517b0b8b..4f764542 100644
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@@ -3,7 +3,6 @@ from pathlib import Path
 
 import pytest
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption