feat!: simplify conversion API (#139)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-10-11 14:52:37 +02:00 committed by GitHub
parent 753f67a434
commit 136f16e85a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 164 additions and 303 deletions

View File

@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
@ -231,12 +231,9 @@ def convert(
} }
) )
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_batch(input) conv_results = doc_converter.convert_all(input_doc_paths)
output.mkdir(parents=True, exist_ok=True) output.mkdir(parents=True, exist_ok=True)
export_documents( export_documents(

View File

@ -19,6 +19,7 @@ from docling_core.types.experimental import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
) )
from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
@ -162,8 +163,7 @@ class DocumentFormat(str, Enum):
V1 = "v1" V1 = "v1"
@deprecated("Use `ConversionResult` instead.") class ConversionResult(BaseModel):
class ConvertedDocument(BaseModel):
input: InputDocument input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success status: ConversionStatus = ConversionStatus.PENDING # failure, success
@ -457,20 +457,16 @@ class ConvertedDocument(BaseModel):
yield element, cropped_im yield element, cropped_im
class ConversionResult(ConvertedDocument): class _DocumentConversionInput(BaseModel):
pass
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits() limits: Optional[DocumentLimits] = DocumentLimits()
def docs( def docs(
self, format_options: Dict[InputFormat, "FormatOption"] self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]: ) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
for obj in self._path_or_stream_iterator: obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj) format = self._guess_format(obj)
if format not in format_options.keys(): if format not in format_options.keys():
_log.debug( _log.debug(
@ -496,6 +492,8 @@ class DocumentConversionInput(BaseModel):
limits=self.limits, limits=self.limits,
backend=backend, backend=backend,
) )
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
def _guess_format(self, obj): def _guess_format(self, obj):
content = None content = None
@ -531,21 +529,3 @@ class DocumentConversionInput(BaseModel):
return "text/html" return "text/html"
return None return None
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = paths
return doc_input
@classmethod
def from_streams(
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
):
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = streams
return doc_input

View File

@ -1,34 +1,24 @@
import logging import logging
import tempfile import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, List, Optional, Type from typing import Dict, Iterable, List, Optional, Type
import requests from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from pydantic import (
AnyHttpUrl,
BaseModel,
ConfigDict,
TypeAdapter,
ValidationError,
field_validator,
model_validator,
)
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,
DocumentConversionInput,
InputDocument, InputDocument,
_DocumentConversionInput,
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import DocumentLimits, settings
from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.base_model_pipeline import AbstractModelPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@ -119,16 +109,56 @@ class DocumentConverter:
Type[AbstractModelPipeline], AbstractModelPipeline Type[AbstractModelPipeline], AbstractModelPipeline
] = {} ] = {}
@deprecated("Use convert_batch instead.") @validate_call(config=ConfigDict(strict=True))
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: def convert(
yield from self.convert_batch(input=input) self,
source: Path | str | DocumentStream, # TODO review naming
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> ConversionResult:
def convert_batch( all_res = self.convert_all(
self, input: DocumentConversionInput, raise_on_error: bool = False source=[source],
raises_on_error=raises_on_error,
max_num_pages=max_num_pages,
max_file_size=max_file_size,
)
return next(all_res)
@validate_call(config=ConfigDict(strict=True))
def convert_all(
self,
source: Iterable[Path | str | DocumentStream], # TODO review naming
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> Iterable[ConversionResult]: ) -> Iterable[ConversionResult]:
limits = DocumentLimits(
max_num_pages=max_num_pages,
max_file_size=max_file_size,
)
conv_input = _DocumentConversionInput(
path_or_stream_iterator=source,
limit=limits,
)
conv_res_iter = self._convert(conv_input)
for conv_res in conv_res_iter:
if raises_on_error and conv_res.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
)
else:
yield conv_res
def _convert(
self, conv_input: _DocumentConversionInput
) -> Iterable[ConversionResult]:
for input_batch in chunkify( for input_batch in chunkify(
input.docs(self.format_to_options), conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options settings.perf.doc_batch_size, # pass format_options
): ):
_log.info(f"Going to convert document batch...") _log.info(f"Going to convert document batch...")
@ -143,58 +173,6 @@ class DocumentConverter:
if item is not None: if item is not None:
yield item yield item
def convert_single(
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
) -> ConversionResult:
"""Convert a single document.
Args:
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
Raises:
ValueError: If source is of unexpected type.
RuntimeError: If conversion fails.
Returns:
ConversionResult: The conversion result object.
"""
with tempfile.TemporaryDirectory() as temp_dir:
try:
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
res = requests.get(http_url, stream=True)
res.raise_for_status()
fname = None
# try to get filename from response header
if cont_disp := res.headers.get("Content-Disposition"):
for par in cont_disp.strip().split(";"):
# currently only handling directive "filename" (not "*filename")
if (split := par.split("=")) and split[0].strip() == "filename":
fname = "=".join(split[1:]).strip().strip("'\"") or None
break
# otherwise, use name from URL:
if fname is None:
fname = Path(http_url.path).name or self._default_download_filename
local_path = Path(temp_dir) / fname
with open(local_path, "wb") as f:
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
f.write(chunk)
except ValidationError:
try:
local_path = TypeAdapter(Path).validate_python(source)
except ValidationError:
raise ValueError(
f"Unexpected file path type encountered: {type(source)}"
)
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
conv_res_iter = self.convert_batch(conv_inp)
conv_res: ConversionResult = next(conv_res_iter)
if conv_res.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
fopt = self.format_to_options.get(doc.format) fopt = self.format_to_options.get(doc.format)

View File

@ -14,13 +14,15 @@ from docling_core.types import Ref
from docling_core.types.experimental import BoundingBox, CoordOrigin from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.experimental.document import DoclingDocument from docling_core.types.experimental.document import DoclingDocument
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster from docling.datamodel.base_models import Cluster
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
class GlmOptions(BaseModel): class GlmOptions(BaseModel):
model_config = ConfigDict(protected_namespaces=())
create_legacy_output: bool = True create_legacy_output: bool = True
model_names: str = "" # e.g. "language;term;reference" model_names: str = "" # e.g. "language;term;reference"

View File

@ -7,7 +7,7 @@ from typing import Iterable
import yaml import yaml
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -125,18 +125,19 @@ def main():
doc_converter = DocumentConverter() doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_batch(input) conv_results = doc_converter.convert_all(
input_doc_paths,
raises_on_error=False, # to let conversion run through all and examine results at the end
)
success_count, partial_success_count, failure_count = export_documents( success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch") conv_results, output_dir=Path("./scratch")
) )
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"Document conversion complete in {end_time:.2f} seconds.")
if failure_count > 0: if failure_count > 0:
raise RuntimeError( raise RuntimeError(

View File

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
@ -65,9 +65,7 @@ def export_documents(
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_path = Path("./tests/data/2206.01062.pdf")
Path("./tests/data/2206.01062.pdf"),
]
########################################################################### ###########################################################################
@ -152,24 +150,13 @@ def main():
########################################################################### ###########################################################################
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_batch(input) conv_result = doc_converter.convert(input_doc_path)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"Document converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -2,13 +2,7 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling.datamodel.base_models import ( from docling.datamodel.base_models import FigureElement, InputFormat, Table
ConversionStatus,
FigureElement,
InputFormat,
Table,
)
from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_path = Path("./tests/data/2206.01062.pdf")
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch") output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory. # will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@ -42,46 +32,29 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_batch(input_files) conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results: doc_filename = conv_res.input.file.stem
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem # Export page images
for page in conv_res.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
# Export page images # Export figures and tables
for page in conv_res.pages: for element, image in conv_res.render_element_images(
page_no = page.page_no + 1 element_types=(FigureElement, Table)
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" ):
with page_image_filename.open("wb") as fp: element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
page.image.save(fp, format="PNG") with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
# Export figures and tables
for element, image in conv_res.render_element_images(
element_types=(FigureElement, Table)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
success_count += 1
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -5,8 +5,7 @@ from pathlib import Path
import pandas as pd import pandas as pd
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages from docling.utils.export import generate_multimodal_pages
@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_path = Path("./tests/data/2206.01062.pdf")
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch") output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory. # will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@ -41,53 +36,45 @@ def main():
start_time = time.time() start_time = time.time()
converted_docs = doc_converter.convert_batch(input_files) conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue
rows = [] rows = []
for ( for (
content_text, content_text,
content_md, content_md,
content_dt, content_dt,
page_cells, page_cells,
page_segments, page_segments,
page, page,
) in generate_multimodal_pages(doc): ) in generate_multimodal_pages(conv_res):
dpi = page._default_image_scale * 72 dpi = page._default_image_scale * 72
rows.append( rows.append(
{ {
"document": doc.input.file.name, "document": conv_res.input.file.name,
"hash": doc.input.document_hash, "hash": conv_res.input.document_hash,
"page_hash": page.page_hash, "page_hash": page.page_hash,
"image": { "image": {
"width": page.image.width, "width": page.image.width,
"height": page.image.height, "height": page.image.height,
"bytes": page.image.tobytes(), "bytes": page.image.tobytes(),
}, },
"cells": page_cells, "cells": page_cells,
"contents": content_text, "contents": content_text,
"contents_md": content_md, "contents_md": content_md,
"contents_dt": content_dt, "contents_dt": content_dt,
"segments": page_segments, "segments": page_segments,
"extra": { "extra": {
"page_num": page.page_no + 1, "page_num": page.page_no + 1,
"width_in_points": page.size.width, "width_in_points": page.size.width,
"height_in_points": page.size.height, "height_in_points": page.size.height,
"dpi": dpi, "dpi": dpi,
}, },
} }
) )
success_count += 1
# Generate one parquet from all documents # Generate one parquet from all documents
df = pd.json_normalize(rows) df = pd.json_normalize(rows)
@ -97,12 +84,9 @@ def main():
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(
f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
if failure_count > 0: )
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
# This block demonstrates how the file can be opened with the HF datasets library # This block demonstrates how the file can be opened with the HF datasets library
# from datasets import Dataset # from datasets import Dataset

View File

@ -4,8 +4,6 @@ from pathlib import Path
import pandas as pd import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -14,59 +12,39 @@ _log = logging.getLogger(__name__)
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_path = Path("./tests/data/2206.01062.pdf")
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch") output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
doc_converter = DocumentConverter() doc_converter = DocumentConverter()
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_batch(input_files) conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem doc_filename = conv_res.input.file.stem
# Export tables # Export tables
for table_ix, table in enumerate(conv_res.legacy_output.tables): for table_ix, table in enumerate(conv_res.legacy_output.tables):
table_df: pd.DataFrame = table.export_to_dataframe() table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}") print(f"## Table {table_ix}")
print(table_df.to_markdown()) print(table_df.to_markdown())
# Save the table as csv # Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}") _log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename) table_df.to_csv(element_csv_filename)
# Save the table as html # Save the table as html
element_html_filename = ( element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
output_dir / f"{doc_filename}-table-{table_ix+1}.html" _log.info(f"Saving HTML table to {element_html_filename}")
) with element_html_filename.open("w") as fp:
_log.info(f"Saving HTML table to {element_html_filename}") fp.write(table.export_to_html())
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())
success_count += 1
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter() converter = DocumentConverter()
result = converter.convert_single(source) result = converter.convert(source)
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]" print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
# if the legacy output is needed, use this version # if the legacy output is needed, use this version
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]" # print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"

View File

@ -6,7 +6,6 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import ( from docling.document_converter import (
DocumentConverter, DocumentConverter,
FormatOption, FormatOption,
@ -28,7 +27,6 @@ input_paths = [
Path("tests/data/2206.01062.pdf"), Path("tests/data/2206.01062.pdf"),
# Path("tests/data/2305.03393v1-pg9-img.png"), # Path("tests/data/2305.03393v1-pg9-img.png"),
] ]
input = DocumentConversionInput.from_paths(input_paths)
## for defaults use: ## for defaults use:
# doc_converter = DocumentConverter() # doc_converter = DocumentConverter()
@ -52,12 +50,12 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
}, },
) )
conv_results = doc_converter.convert_batch(input) conv_results = doc_converter.convert_all(input_paths)
for res in conv_results: for res in conv_results:
out_path = Path("./scratch") out_path = Path("./scratch")
print( print(
f"Document {res.input.file.name} converted with status {res.status}." f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {str(out_path)}"
) )
# print(res.experimental.export_to_markdown()) # print(res.experimental.export_to_markdown())

View File

@ -3,7 +3,7 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
@ -48,7 +48,7 @@ def test_e2e_conversions():
for pdf_path in pdf_paths: for pdf_path in pdf_paths:
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert_single(pdf_path) doc_result: ConversionResult = converter.convert(pdf_path)
verify_conversion_result_v1( verify_conversion_result_v1(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1 input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1

View File

@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrOptions, OcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
PipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
@ -90,7 +89,7 @@ def test_e2e_conversions():
for pdf_path in pdf_paths: for pdf_path in pdf_paths:
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert_single(pdf_path) doc_result: ConversionResult = converter.convert(pdf_path)
# Save conversions # Save conversions
# save_output(pdf_path, doc_result, None) # save_output(pdf_path, doc_result, None)

View File

@ -5,8 +5,7 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
@ -37,39 +36,24 @@ def converter():
return converter return converter
def test_convert_single(converter: DocumentConverter): def test_convert_path(converter: DocumentConverter):
pdf_path = get_pdf_path() pdf_path = get_pdf_path()
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert_single(pdf_path) doc_result = converter.convert(pdf_path)
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
def test_batch_path(converter: DocumentConverter): def test_convert_stream(converter: DocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
conv_input = DocumentConversionInput.from_paths([pdf_path])
results = converter.convert_batch(conv_input)
for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
def test_batch_bytes(converter: DocumentConverter):
pdf_path = get_pdf_path() pdf_path = get_pdf_path()
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
buf = BytesIO(pdf_path.open("rb").read()) buf = BytesIO(pdf_path.open("rb").read())
docs = [DocumentStream(name=pdf_path.name, stream=buf)] stream = DocumentStream(name=pdf_path.name, stream=buf)
conv_input = DocumentConversionInput.from_streams(docs)
results = converter.convert_batch(conv_input) doc_result = converter.convert(stream)
for doc_result in results: verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)

View File

@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
for converter in get_converters_with_table_options(): for converter in get_converters_with_table_options():
print(f"converting {test_doc_path}") print(f"converting {test_doc_path}")
doc_result: ConversionResult = converter.convert_single(test_doc_path) doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS