Add migration instructions to doc (wip)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 17:08:48 +02:00
parent 9d15f4d5bf
commit 74e0452b6a
13 changed files with 116 additions and 30 deletions

View File

@ -11,7 +11,6 @@ import typer
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
FormatToExtensions, FormatToExtensions,

View File

@ -23,7 +23,6 @@ from docling_core.types.doc import (
TextItem, TextItem,
) )
from docling_core.types.doc.document import ListItem from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel from pydantic import BaseModel
@ -43,8 +42,6 @@ from docling.datamodel.base_models import (
MimeTypeToFormat, MimeTypeToFormat,
Page, Page,
PageElement, PageElement,
Table,
TextElement,
) )
from docling.datamodel.settings import DocumentLimits from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash, create_hash from docling.utils.utils import create_file_hash, create_hash

View File

@ -1,9 +1,8 @@
import warnings
from enum import Enum, auto from enum import Enum, auto
from pathlib import Path from pathlib import Path
from typing import Annotated, List, Literal, Optional, Union from typing import List, Literal, Optional, Union
from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum): class TableFormerMode(str, Enum):

View File

@ -1,7 +1,7 @@
import copy import copy
import logging import logging
from abc import abstractmethod from abc import abstractmethod
from typing import Iterable, List, Tuple from typing import Iterable, List
import numpy as np import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin

View File

@ -1,16 +1,12 @@
import copy import copy
import random import random
from typing import List, Tuple, Union from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import ( from deepsearch_glm.utils.doc_utils import to_docling_document
to_docling_document,
to_legacy_document_format,
)
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText from docling_core.types import BaseText
from docling_core.types import Document as DsDocument from docling_core.types import Document as DsDocument
from docling_core.types import Document as DsLegacyDocument
from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import PageDimensions, PageReference, Prov, Ref

View File

@ -1,7 +1,6 @@
import logging import logging
from typing import Iterable from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import OcrCell, Page

View File

@ -1,9 +1,9 @@
import json import json
import logging import logging
import time
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
import time
import yaml import yaml
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus

View File

@ -1,18 +1,14 @@
import json import json
import logging import logging
import time
from pathlib import Path from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat import time
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,7 +1,8 @@
import logging import logging
import time
from pathlib import Path from pathlib import Path
import time
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -1,9 +1,9 @@
import datetime import datetime
import logging import logging
import time
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import time
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -1,8 +1,8 @@
import logging import logging
import time
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import time
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter

View File

@ -1,7 +1,107 @@
## What's new ## What's new
Stay tuned! Docling v2 introduces several new features:
- Understands and converts PDF, MS Word, MS PowerPoint, HTML and several image formats
- Produces a new, universal document representation which can encapsulate document hierarchy
- Comes with a fresh new API and CLI
## Migration from v1 ## Migration from v1
Stay tuned! ### Setting up a `DocumentConverter`
To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
will be used for all `allowed_formats`.
Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend.
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
```python
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
## Default initialization still works as before:
# doc_converter = DocumentConverter()
## Custom options are now defined per format.
doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
allowed_formats=[
InputFormat.PDF,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
),
},
)
)
```
Note: If you work only with defaults, everything remains the same as in Docling v1.
### Converting documents
We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for
better semantics. You can now call the conversion directly with a single file, or a list of input files,
or `DocumentStream` objects, without constructing a `DocumentConversionInput` object first.
* `DocumentConverter.convert` now converts a single file input (previously `DocumentConverter.convert_single`).
* `DocumentConverter.convert_all` now converts many files at once (previously `DocumentConverter.convert`).
```python
...
## Convert a single file (from URL or local path)
conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
## Convert several files at once:
input_files = [
"tests/data/wiki_duck.html",
"tests/data/word_sample.docx",
"tests/data/lorem_ipsum.docx",
"tests/data/powerpoint_sample.pptx",
"tests/data/2305.03393v1-pg9-img.png",
"tests/data/2206.01062.pdf",
]
conv_results_iter = doc_converter.convert_all(input_files) # previously `convert`
```
Through the `raises_on_error` argument, you can also control whether the conversion should raise exceptions when first
encountering a problem, or resiliently convert all files first and reflect errors in each file's conversion status.
By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed).
```python
...
conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert`
```
### Exporting documents into JSON, Markdown, Doctags
We have simplified how you can access and export the converted document data, too.
TBD.
### CLI
TBD.

View File

@ -3,7 +3,6 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption