Merge from simplify-conv-api

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2024-10-11 15:57:08 +02:00
commit d0fccb9342
22 changed files with 286 additions and 380 deletions

View File

@@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     PdfPipelineOptions,
@@ -231,12 +231,9 @@ def convert(
         }
     )

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()
-    conv_results = doc_converter.convert_batch(input)
+    conv_results = doc_converter.convert_all(input_doc_paths)

     output.mkdir(parents=True, exist_ok=True)
     export_documents(

View File

@@ -19,6 +19,7 @@ from docling_core.types.experimental import (
     DocItemLabel,
     DoclingDocument,
 )
+from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -158,8 +159,7 @@ class DocumentFormat(str, Enum):
     V1 = "v1"


-@deprecated("Use `ConversionResult` instead.")
-class ConvertedDocument(BaseModel):
+class ConversionResult(BaseModel):
     input: InputDocument

     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
@@ -471,20 +471,16 @@ class ConvertedDocument(BaseModel):
                 yield element, cropped_im


-class ConversionResult(ConvertedDocument):
-    pass
-
-
-class DocumentConversionInput(BaseModel):
-    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
+class _DocumentConversionInput(BaseModel):
+    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
-        for obj in self._path_or_stream_iterator:
+        for item in self.path_or_stream_iterator:
+            obj = resolve_file_source(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.debug(
@@ -510,6 +506,8 @@ class DocumentConversionInput(BaseModel):
                     limits=self.limits,
                     backend=backend,
                 )
+            else:
+                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

     def _guess_format(self, obj):
         content = None
@@ -545,21 +543,3 @@ class DocumentConversionInput(BaseModel):
             return "text/html"

         return None
-
-    @classmethod
-    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
-        paths = [Path(p) for p in paths]
-
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = paths
-
-        return doc_input
-
-    @classmethod
-    def from_streams(
-        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
-    ):
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = streams
-
-        return doc_input
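
Note: with this change, callers pass plain sources directly instead of building an input object; `_DocumentConversionInput.docs()` resolves URL strings through `resolve_file_source`. A minimal sketch of the three accepted source kinds (file names are hypothetical):

    from io import BytesIO
    from pathlib import Path

    from docling.datamodel.base_models import DocumentStream

    # The three kinds of items the new input model accepts directly:
    sources = [
        Path("report.pdf"),  # a local file path (hypothetical name)
        "https://arxiv.org/pdf/2408.09869",  # a URL string, fetched via resolve_file_source()
        DocumentStream(name="in-memory.pdf", stream=BytesIO(b"%PDF-1.4 ...")),  # placeholder bytes
    ]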

View File

@@ -75,19 +75,4 @@ class PdfPipelineOptions(PipelineOptions):
         Field(EasyOcrOptions(), discriminator="kind")
     )

-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
     images_scale: Optional[float] = None  # if set, the scale for generated images
-
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self
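
With the deprecated `keep_page_images` flag and its compatibility validator removed, keeping page images is expressed solely through `images_scale`. A short before/after sketch:

    from docling.datamodel.pipeline_options import PdfPipelineOptions

    pipeline_options = PdfPipelineOptions()
    # Before this commit: pipeline_options.keep_page_images = True
    # Now: setting a scale both keeps the page images and defines their resolution;
    # leaving it at None (the default) lets the pipeline drop them.
    pipeline_options.images_scale = 2.0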

View File

@@ -1,33 +1,24 @@
 import logging
-import tempfile
+import sys
 import time
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Type

-import requests
-from pydantic import (
-    AnyHttpUrl,
-    BaseModel,
-    ConfigDict,
-    TypeAdapter,
-    ValidationError,
-    model_validator,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
+from docling.datamodel.settings import DocumentLimits, settings
 from docling.pipeline.base_model_pipeline import AbstractModelPipeline
 from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
 from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@@ -118,16 +109,56 @@ class DocumentConverter:
         Type[AbstractModelPipeline], AbstractModelPipeline
     ] = {}

-    @deprecated("Use convert_batch instead.")
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
-        yield from self.convert_batch(input=input)
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)

-    def convert_batch(
-        self, input: DocumentConversionInput, raise_on_error: bool = False
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
     ) -> Iterable[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limits=limits,
+        )
+        conv_res_iter = self._convert(conv_input)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res
+
+    def _convert(
+        self, conv_input: _DocumentConversionInput
+    ) -> Iterable[ConversionResult]:
         for input_batch in chunkify(
-            input.docs(self.format_to_options),
+            conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
@@ -142,58 +173,6 @@ class DocumentConverter:
                 if item is not None:
                     yield item

-    def convert_single(
-        self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
-    ) -> ConversionResult:
-        """Convert a single document.
-
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert_batch(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-            if conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-        return conv_res
-
     def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
         fopt = self.format_to_options.get(doc.format)
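
A usage sketch of the reworked entry points: `convert()` takes one source and raises on failure by default, while `convert_all()` lazily yields results and maps the caps onto `DocumentLimits` (paths below are hypothetical; `max_file_size` is presumably in bytes):

    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()

    # Single source: a path, URL string, or DocumentStream.
    result = converter.convert("https://arxiv.org/pdf/2408.09869")

    # Many sources: results stream back as each document finishes.
    for res in converter.convert_all(
        ["doc1.pdf", "doc2.pdf"],
        raises_on_error=False,  # collect failures instead of raising on the first one
        max_num_pages=100,
        max_file_size=20 * 1024 * 1024,
    ):
        print(res.input.file, res.status)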

View File

@@ -14,23 +14,26 @@ from docling_core.types import Ref
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from docling_core.types.experimental.document import DoclingDocument
 from PIL import ImageDraw
+from pydantic import BaseModel, ConfigDict

 from docling.datamodel.base_models import Cluster
 from docling.datamodel.document import ConversionResult


-class GlmModel:
-    def __init__(self, config):
-        self.config = config
-        self.create_legacy_output = config.get("create_legacy_output", True)
+class GlmOptions(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())

-        self.model_names = self.config.get(
-            "model_names", ""
-        )  # "language;term;reference"
+    create_legacy_output: bool = True
+    model_names: str = ""  # e.g. "language;term;reference"
+
+
+class GlmModel:
+    def __init__(self, options: GlmOptions):
+        self.options = options
+
+        self.create_legacy_output = self.options.create_legacy_output
         load_pretrained_nlp_models()
-        # model = init_nlp_model(model_names="language;term;reference")
-        model = init_nlp_model(model_names=self.model_names)
-        self.model = model
+        self.model = init_nlp_model(model_names=self.options.model_names)

     def __call__(
         self, conv_res: ConversionResult
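
The untyped `config` dict becomes a `GlmOptions` model; `protected_namespaces=()` is needed so pydantic does not reserve the `model_names` field name. Construction now looks roughly like this sketch:

    from docling.models.ds_glm_model import GlmModel, GlmOptions

    glm_model = GlmModel(options=GlmOptions(create_legacy_output=True, model_names=""))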

View File

@@ -2,6 +2,7 @@ import copy
 import logging
 import random
 import time
+from pathlib import Path
 from typing import Iterable, List

 from docling_core.types.experimental import CoordOrigin
@@ -43,11 +44,8 @@ class LayoutModel(AbstractPageModel):
     FIGURE_LABEL = DocItemLabel.PICTURE
     FORMULA_LABEL = DocItemLabel.FORMULA

-    def __init__(self, config):
-        self.config = config
-        self.layout_predictor = LayoutPredictor(
-            config["artifacts_path"]
-        )  # TODO temporary
+    def __init__(self, artifacts_path: Path):
+        self.layout_predictor = LayoutPredictor(artifacts_path)  # TODO temporary

     def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
         MIN_INTERSECTION = 0.2

View File

@@ -2,6 +2,8 @@ import logging
 import re
 from typing import Iterable, List

+from pydantic import BaseModel
+
 from docling.datamodel.base_models import (
     AssembledUnit,
     FigureElement,
@@ -16,9 +18,13 @@ from docling.models.layout_model import LayoutModel
 _log = logging.getLogger(__name__)


+class PageAssembleOptions(BaseModel):
+    keep_images: bool = False
+
+
 class PageAssembleModel(AbstractPageModel):
-    def __init__(self, config):
-        self.config = config
+    def __init__(self, options: PageAssembleOptions):
+        self.options = options

     def sanitize_text(self, lines):
         if len(lines) <= 1:
@@ -147,7 +153,7 @@ class PageAssembleModel(AbstractPageModel):
                 )

                 # Remove page images (can be disabled)
-                if self.config["images_scale"] is None:
+                if not self.options.keep_images:
                     page._image_cache = {}

                 # Unload backend

View File

@@ -1,14 +1,19 @@
-from typing import Iterable
+from typing import Iterable, Optional

 from PIL import ImageDraw
+from pydantic import BaseModel

 from docling.datamodel.base_models import Page
 from docling.models.abstract_model import AbstractPageModel


+class PagePreprocessingOptions(BaseModel):
+    images_scale: Optional[float]
+
+
 class PagePreprocessingModel(AbstractPageModel):
-    def __init__(self, config):
-        self.config = config
+    def __init__(self, options: PagePreprocessingOptions):
+        self.options = options

     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
@@ -23,7 +28,7 @@ class PagePreprocessingModel(AbstractPageModel):
             scale=1.0
         )  # puts the page image on the image cache at default scale

-        images_scale = self.config["images_scale"]
+        images_scale = self.options.images_scale
         # user requested scales
         if images_scale is not None:
             page._default_image_scale = images_scale

View File

@@ -10,19 +10,21 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw

 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
-from docling.datamodel.pipeline_options import TableFormerMode
+from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
 from docling.models.abstract_model import AbstractPageModel


 class TableStructureModel(AbstractPageModel):
-    def __init__(self, config):
-        self.config = config
-        self.do_cell_matching = config["do_cell_matching"]
-        self.mode = config["mode"]
+    def __init__(
+        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+    ):
+        self.options = options
+        self.do_cell_matching = self.options.do_cell_matching
+        self.mode = self.options.mode

-        self.enabled = config["enabled"]
+        self.enabled = enabled
         if self.enabled:
-            artifacts_path: Path = config["artifacts_path"]
+            artifacts_path = artifacts_path

             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "fat"
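
The table model likewise trades its config dict for explicit arguments plus the typed `TableStructureOptions` already carried by `PdfPipelineOptions`. A construction sketch (artifacts path hypothetical):

    from pathlib import Path

    from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
    from docling.models.table_structure_model import TableStructureModel

    model = TableStructureModel(
        enabled=True,
        artifacts_path=Path("./artifacts/tableformer"),  # hypothetical location
        options=TableStructureOptions(
            do_cell_matching=True,
            mode=TableFormerMode.ACCURATE,  # selects the "fat" weights, per the branch above
        ),
    )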

View File

@@ -13,11 +13,14 @@ from docling.datamodel.pipeline_options import (
     TesseractOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.models.ds_glm_model import GlmModel
+from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
-from docling.models.page_assemble_model import PageAssembleModel
-from docling.models.page_preprocessing_model import PagePreprocessingModel
+from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+from docling.models.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -32,57 +35,50 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
     def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
+        self.pipeline_options: PdfPipelineOptions

         if not pipeline_options.artifacts_path:
             artifacts_path = self.download_models_hf()

         self.artifacts_path = Path(artifacts_path)
         self.glm_model = GlmModel(
-            config={"create_legacy_output": pipeline_options.create_legacy_output}
+            options=GlmOptions(
+                create_legacy_output=pipeline_options.create_legacy_output
+            )
         )

-        ocr_model: BaseOcrModel
-        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
-            ocr_model = EasyOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
-            ocr_model = TesseractOcrCliModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
-            ocr_model = TesseractOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        else:
+        if (ocr_model := self.get_ocr_model()) is None:
             raise RuntimeError(
                 f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
             )

         self.model_pipe = [
+            # Pre-processing
             PagePreprocessingModel(
-                config={"images_scale": pipeline_options.images_scale}
+                options=PagePreprocessingOptions(
+                    images_scale=pipeline_options.images_scale
+                )
             ),
+            # OCR
             ocr_model,
+            # Layout model
             LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardPdfModelPipeline._layout_model_path
-                }
+                artifacts_path=artifacts_path
+                / StandardPdfModelPipeline._layout_model_path
             ),
+            # Table structure model
             TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardPdfModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                    "mode": pipeline_options.table_structure_options.mode,
-                }
+                enabled=pipeline_options.do_table_structure,
+                artifacts_path=artifacts_path
+                / StandardPdfModelPipeline._table_model_path,
+                options=pipeline_options.table_structure_options,
+            ),
+            # Page assemble
+            PageAssembleModel(
+                options=PageAssembleOptions(
+                    keep_images=pipeline_options.images_scale is not None
+                )
             ),
-            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
         ]
@@ -104,6 +100,24 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):

         return Path(download_path)

+    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
+            return EasyOcrModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
+            return TesseractOcrCliModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
+            return TesseractOcrModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
+        return None
+
     def initialize_page(self, doc: InputDocument, page: Page) -> Page:
         page._backend = doc._backend.load_page(page.page_no)
         page.size = page._backend.get_size()
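
Pipeline construction now funnels OCR selection through `get_ocr_model()`, which dispatches on the type of `ocr_options` and returns `None` for unknown kinds (turned into a RuntimeError by the constructor). A sketch of choosing an engine through options (note: constructing the pipeline downloads model artifacts if no `artifacts_path` is configured):

    from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions
    from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

    pipeline_options = PdfPipelineOptions(ocr_options=TesseractOcrOptions())
    pipeline = StandardPdfModelPipeline(pipeline_options)  # OCR engine chosen by options type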

View File

@@ -7,7 +7,7 @@ from typing import Iterable
 import yaml

 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -125,18 +125,19 @@ def main():
     doc_converter = DocumentConverter()

-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths,
+        raises_on_error=False,  # to let conversion run through all and examine results at the end
+    )
     success_count, partial_success_count, failure_count = export_documents(
         conv_results, output_dir=Path("./scratch")
     )

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

     if failure_count > 0:
         raise RuntimeError(

View File

@@ -5,9 +5,14 @@ from pathlib import Path
 from typing import Iterable

 from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

 _log = logging.getLogger(__name__)
@@ -60,9 +65,7 @@ def export_documents(
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")

     ###########################################################################
@@ -147,24 +150,13 @@ def main():

     ###########################################################################

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
-
-    success_count, failure_count = export_documents(
-        conv_results, output_dir=Path("./scratch")
-    )
+    conv_result = doc_converter.convert(input_doc_path)

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted in {end_time:.2f} seconds.")


 if __name__ == "__main__":
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -2,13 +2,7 @@ import logging
 import time
 from pathlib import Path

-from docling.datamodel.base_models import (
-    ConversionStatus,
-    FigureElement,
-    InputFormat,
-    Table,
-)
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
     # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -42,46 +32,29 @@ def main():

     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
-
-    success_count = 0
-    failure_count = 0
+    conv_res = doc_converter.convert(input_doc_path)

     output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue
-
-        doc_filename = conv_res.input.file.stem
-
-        # Export page images
-        for page in conv_res.pages:
-            page_no = page.page_no + 1
-            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
-            with page_image_filename.open("wb") as fp:
-                page.image.save(fp, format="PNG")
-
-        # Export figures and tables
-        for element, image in conv_res.render_element_images(
-            element_types=(FigureElement, Table)
-        ):
-            element_image_filename = (
-                output_dir / f"{doc_filename}-element-{element.id}.png"
-            )
-            with element_image_filename.open("wb") as fp:
-                image.save(fp, "PNG")
-
-        success_count += 1
+    doc_filename = conv_res.input.file.stem
+
+    # Export page images
+    for page in conv_res.pages:
+        page_no = page.page_no + 1
+        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
+        with page_image_filename.open("wb") as fp:
+            page.image.save(fp, format="PNG")
+
+    # Export figures and tables
+    for element, image in conv_res.render_element_images(
+        element_types=(FigureElement, Table)
+    ):
+        element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
+        with element_image_filename.open("wb") as fp:
+            image.save(fp, "PNG")

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":

View File

@@ -5,8 +5,7 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.utils.export import generate_multimodal_pages
@@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
     # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -41,53 +36,45 @@ def main():

     start_time = time.time()

-    converted_docs = doc_converter.convert_batch(input_files)
-
-    success_count = 0
-    failure_count = 0
+    conv_res = doc_converter.convert(input_doc_path)
+
     output_dir.mkdir(parents=True, exist_ok=True)
-    for doc in converted_docs:
-        if doc.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {doc.input.file} failed to convert.")
-            failure_count += 1
-            continue
-
-        rows = []
-        for (
-            content_text,
-            content_md,
-            content_dt,
-            page_cells,
-            page_segments,
-            page,
-        ) in generate_multimodal_pages(doc):
-            dpi = page._default_image_scale * 72
-
-            rows.append(
-                {
-                    "document": doc.input.file.name,
-                    "hash": doc.input.document_hash,
-                    "page_hash": page.page_hash,
-                    "image": {
-                        "width": page.image.width,
-                        "height": page.image.height,
-                        "bytes": page.image.tobytes(),
-                    },
-                    "cells": page_cells,
-                    "contents": content_text,
-                    "contents_md": content_md,
-                    "contents_dt": content_dt,
-                    "segments": page_segments,
-                    "extra": {
-                        "page_num": page.page_no + 1,
-                        "width_in_points": page.size.width,
-                        "height_in_points": page.size.height,
-                        "dpi": dpi,
-                    },
-                }
-            )
-
-        success_count += 1
+
+    rows = []
+    for (
+        content_text,
+        content_md,
+        content_dt,
+        page_cells,
+        page_segments,
+        page,
+    ) in generate_multimodal_pages(conv_res):
+        dpi = page._default_image_scale * 72
+
+        rows.append(
+            {
+                "document": conv_res.input.file.name,
+                "hash": conv_res.input.document_hash,
+                "page_hash": page.page_hash,
+                "image": {
+                    "width": page.image.width,
+                    "height": page.image.height,
+                    "bytes": page.image.tobytes(),
+                },
+                "cells": page_cells,
+                "contents": content_text,
+                "contents_md": content_md,
+                "contents_dt": content_dt,
+                "segments": page_segments,
+                "extra": {
+                    "page_num": page.page_no + 1,
+                    "width_in_points": page.size.width,
+                    "height_in_points": page.size.height,
+                    "dpi": dpi,
+                },
+            }
+        )
@@ -97,12 +84,9 @@ def main():

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(
+        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
+    )

     # This block demonstrates how the file can be opened with the HF datasets library
     # from datasets import Dataset

View File

@@ -4,8 +4,6 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -14,59 +12,39 @@ _log = logging.getLogger(__name__)
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     doc_converter = DocumentConverter()

     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
-
-    success_count = 0
-    failure_count = 0
+    conv_res = doc_converter.convert(input_doc_path)
+
     output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue
-
-        doc_filename = conv_res.input.file.stem
-
-        # Export tables
-        for table_ix, table in enumerate(conv_res.legacy_output.tables):
-            table_df: pd.DataFrame = table.export_to_dataframe()
-            print(f"## Table {table_ix}")
-            print(table_df.to_markdown())
-
-            # Save the table as csv
-            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
-            _log.info(f"Saving CSV table to {element_csv_filename}")
-            table_df.to_csv(element_csv_filename)
-
-            # Save the table as html
-            element_html_filename = (
-                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
-            )
-            _log.info(f"Saving HTML table to {element_html_filename}")
-            with element_html_filename.open("w") as fp:
-                fp.write(table.export_to_html())
-
-        success_count += 1
+
+    doc_filename = conv_res.input.file.stem
+
+    # Export tables
+    for table_ix, table in enumerate(conv_res.legacy_output.tables):
+        table_df: pd.DataFrame = table.export_to_dataframe()
+        print(f"## Table {table_ix}")
+        print(table_df.to_markdown())
+
+        # Save the table as csv
+        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+        _log.info(f"Saving CSV table to {element_csv_filename}")
+        table_df.to_csv(element_csv_filename)
+
+        # Save the table as html
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+        _log.info(f"Saving HTML table to {element_html_filename}")
+        with element_html_filename.open("w") as fp:
+            fp.write(table.export_to_html())

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":

View File

@@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
-result = converter.convert_single(source)
+result = converter.convert(source)
 print(result.output.export_to_markdown())  # output: ## Docling Technical Report [...]"
 # if the legacy output is needed, use this version
 # print(result.render_as_markdown_v1())  # output: ## Docling Technical Report [...]"

View File

@@ -4,7 +4,6 @@ from pathlib import Path

 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import (
     DocumentConverter,
     PdfFormatOption,
@@ -25,7 +24,6 @@ input_paths = [
     Path("tests/data/2206.01062.pdf"),
     # Path("tests/data/2305.03393v1-pg9-img.png"),
 ]
-input = DocumentConversionInput.from_paths(input_paths)

 ## for defaults use:
 # doc_converter = DocumentConverter()
@@ -50,12 +48,36 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
     },
 )

-conv_results = doc_converter.convert_batch(input)
+doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
+    pdf=None,
+    docx=WordFormatOption(
+        pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+    ),
+    formats=[
+        InputFormat.PDF,
+        # InputFormat.IMAGE,
+        InputFormat.DOCX,
+        InputFormat.HTML,
+        InputFormat.PPTX,
+    ],  # whitelist formats, other files are ignored.
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
+        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
+        InputFormat.DOCX: WordFormatOption(
+            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+        ),
+        # InputFormat.IMAGE: PdfFormatOption(),
+    },
+)
+
+conv_results = doc_converter.convert_all(input_paths)

 for res in conv_results:
     out_path = Path("./scratch")
     print(
-        f"Document {res.input.file.name} converted with status {res.status}."
+        f"Document {res.input.file.name} converted."
         f"\nSaved markdown output to: {str(out_path)}"
     )
     # print(res.experimental.export_to_markdown())

View File

@@ -48,7 +48,7 @@ def test_e2e_conversions():
     for pdf_path in pdf_paths:
         print(f"converting {pdf_path}")

-        doc_result: ConversionResult = converter.convert_single(pdf_path)
+        doc_result: ConversionResult = converter.convert(pdf_path)

         verify_conversion_result_v1(
             input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1

View File

@@ -89,7 +89,7 @@ def test_e2e_conversions():
     for pdf_path in pdf_paths:
         print(f"converting {pdf_path}")

-        doc_result: ConversionResult = converter.convert_single(pdf_path)
+        doc_result: ConversionResult = converter.convert(pdf_path)

         # Save conversions
         # save_output(pdf_path, doc_result, None)

View File

@@ -5,7 +5,6 @@ import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -37,39 +36,24 @@ def converter():
     return converter


-def test_convert_single(converter: DocumentConverter):
+def test_convert_path(converter: DocumentConverter):
     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")

-    doc_result: ConversionResult = converter.convert_single(pdf_path)
+    doc_result = converter.convert(pdf_path)
     verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
     verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)


-def test_batch_path(converter: DocumentConverter):
-    pdf_path = get_pdf_path()
-    print(f"converting {pdf_path}")
-
-    conv_input = DocumentConversionInput.from_paths([pdf_path])
-
-    results = converter.convert_batch(conv_input)
-    for doc_result in results:
-        verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
-        verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
-
-
-def test_batch_bytes(converter: DocumentConverter):
+def test_convert_stream(converter: DocumentConverter):
     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")

     buf = BytesIO(pdf_path.open("rb").read())
-    docs = [DocumentStream(name=pdf_path.name, stream=buf)]
-    conv_input = DocumentConversionInput.from_streams(docs)
+    stream = DocumentStream(name=pdf_path.name, stream=buf)

-    results = converter.convert_batch(conv_input)
-    for doc_result in results:
-        verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
-        verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
+    doc_result = converter.convert(stream)
+    verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
+    verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)

View File

@@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
     for converter in get_converters_with_table_options():
         print(f"converting {test_doc_path}")

-        doc_result: ConversionResult = converter.convert_single(test_doc_path)
+        doc_result: ConversionResult = converter.convert(test_doc_path)

         assert doc_result.status == ConversionStatus.SUCCESS

View File

@@ -1,4 +1,5 @@
 import json
+import warnings
 from pathlib import Path
 from typing import List
@@ -234,8 +235,10 @@ def verify_conversion_result_v1(
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.legacy_output
-    doc_pred_md = doc_result.render_as_markdown()
-    doc_pred_dt = doc_result.render_as_doctags()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", DeprecationWarning)
+        doc_pred_md = doc_result.render_as_markdown()
+        doc_pred_dt = doc_result.render_as_doctags()

     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

     gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name