docling (https://github.com/DS4SD/docling.git), commit 8777b759ae (parent 753f67a434)

feat!: simplify conversion API

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
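In plain terms, this breaking change removes the `DocumentConversionInput` wrapper and the deprecated `convert_single`/`convert_batch` entry points: `convert` now takes a single source (path, URL string, or `DocumentStream`) and returns one `ConversionResult`, while `convert_all` takes an iterable of such sources and yields results. A minimal migration sketch (file names are illustrative):

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()

# Before: inputs were wrapped first, then converted in batch.
# conv_input = DocumentConversionInput.from_paths([Path("report.pdf")])
# results = converter.convert_batch(conv_input)

# After: sources go straight to the converter.
result = converter.convert("report.pdf")  # single source -> ConversionResult
results = converter.convert_all(["a.pdf", "b.pdf"])  # iterable -> iterator of results
```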
@@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     PdfPipelineOptions,
@@ -231,12 +231,9 @@ def convert(
         }
     )

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
+    conv_results = doc_converter.convert_all(input_doc_paths)

     output.mkdir(parents=True, exist_ok=True)
     export_documents(
@@ -19,6 +19,7 @@ from docling_core.types.experimental import (
     DocItemLabel,
     DoclingDocument,
 )
+from docling_core.utils.file import resolve_file_source
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -162,8 +163,7 @@ class DocumentFormat(str, Enum):
     V1 = "v1"


-@deprecated("Use `ConversionResult` instead.")
-class ConvertedDocument(BaseModel):
+class ConversionResult(BaseModel):
     input: InputDocument

     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
@@ -457,20 +457,16 @@ class ConvertedDocument(BaseModel):
             yield element, cropped_im


-class ConversionResult(ConvertedDocument):
-    pass
-
-
-class DocumentConversionInput(BaseModel):
+class _DocumentConversionInput(BaseModel):

-    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
+    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:

-        for obj in self._path_or_stream_iterator:
+        for item in self.path_or_stream_iterator:
+            obj = resolve_file_source(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.debug(
@@ -496,6 +492,8 @@ class DocumentConversionInput(BaseModel):
                     limits=self.limits,
                     backend=backend,
                 )
+            else:
+                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

     def _guess_format(self, obj):
         content = None
@@ -531,21 +529,3 @@ class DocumentConversionInput(BaseModel):
                 return "text/html"

         return None
-
-    @classmethod
-    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
-        paths = [Path(p) for p in paths]
-
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = paths
-
-        return doc_input
-
-    @classmethod
-    def from_streams(
-        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
-    ):
-        doc_input = cls(limits=limits)
-        doc_input._path_or_stream_iterator = streams
-
-        return doc_input
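Because `docs()` now routes plain strings through `resolve_file_source`, callers can mix local paths, URLs, and in-memory streams in a single call; a sketch, with the URL and stream bytes as placeholders:

```python
from io import BytesIO
from pathlib import Path

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
sources = [
    Path("tests/data/2206.01062.pdf"),  # local path
    "https://arxiv.org/pdf/2408.09869",  # URL string, fetched via resolve_file_source
    DocumentStream(name="in_memory.pdf", stream=BytesIO(b"%PDF-")),  # placeholder bytes
]
for res in converter.convert_all(sources, raises_on_error=False):
    print(res.input.file, res.status)
```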
@@ -1,34 +1,24 @@
 import logging
-import tempfile
+import sys
 import time
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Type

-import requests
-from pydantic import (
-    AnyHttpUrl,
-    BaseModel,
-    ConfigDict,
-    TypeAdapter,
-    ValidationError,
-    field_validator,
-    model_validator,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
+from docling.datamodel.settings import DocumentLimits, settings
 from docling.pipeline.base_model_pipeline import AbstractModelPipeline
 from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
 from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@@ -119,16 +109,56 @@ class DocumentConverter:
         Type[AbstractModelPipeline], AbstractModelPipeline
     ] = {}

-    @deprecated("Use convert_batch instead.")
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
-        yield from self.convert_batch(input=input)
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)

-    def convert_batch(
-        self, input: DocumentConversionInput, raise_on_error: bool = False
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
     ) -> Iterable[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limits=limits,
+        )
+        conv_res_iter = self._convert(conv_input)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res
+
+    def _convert(
+        self, conv_input: _DocumentConversionInput
+    ) -> Iterable[ConversionResult]:
         for input_batch in chunkify(
-            input.docs(self.format_to_options),
+            conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
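Note how `max_num_pages` and `max_file_size` are folded into a `DocumentLimits` object per call, replacing any pre-built input object, and how `convert` is just `convert_all` over a one-element list. A usage sketch against the signatures above:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()

# Both limits default to sys.maxsize, i.e. effectively unlimited.
result = converter.convert(
    "tests/data/2206.01062.pdf",
    max_num_pages=50,  # documents with more pages fail conversion
    max_file_size=20 * 1024**2,  # size cap in bytes
)
print(result.status)
```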
@@ -143,58 +173,6 @@ class DocumentConverter:
                 if item is not None:
                     yield item

-    def convert_single(
-        self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
-    ) -> ConversionResult:
-        """Convert a single document.
-
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert_batch(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-        if conv_res.status not in {
-            ConversionStatus.SUCCESS,
-            ConversionStatus.PARTIAL_SUCCESS,
-        }:
-            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-        return conv_res

     def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
         fopt = self.format_to_options.get(doc.format)
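The removed `convert_single` carried its own `requests`-based URL download and filename sniffing; that job now falls to `resolve_file_source` inside `_DocumentConversionInput.docs()`. Error handling moves to the caller as well: with the default `raises_on_error=True`, a failed document raises `RuntimeError`, so batch callers that want to inspect every result should opt out; a sketch with illustrative file names:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
results = converter.convert_all(
    ["good.pdf", "broken.pdf"],  # illustrative file names
    raises_on_error=False,  # collect failures instead of raising on the first one
)
for res in results:
    print(f"{res.input.file}: {res.status}")
```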
@@ -14,13 +14,15 @@ from docling_core.types import Ref
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from docling_core.types.experimental.document import DoclingDocument
 from PIL import ImageDraw
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict

 from docling.datamodel.base_models import Cluster
 from docling.datamodel.document import ConversionResult


 class GlmOptions(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    create_legacy_output: bool = True
     model_names: str = ""  # e.g. "language;term;reference"
@@ -7,7 +7,7 @@ from typing import Iterable
 import yaml

 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -125,18 +125,19 @@ def main():
     doc_converter = DocumentConverter()

-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths,
+        raises_on_error=False,  # to let conversion run through all and examine results at the end
+    )
     success_count, partial_success_count, failure_count = export_documents(
         conv_results, output_dir=Path("./scratch")
     )

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

     if failure_count > 0:
         raise RuntimeError(
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Iterable

 from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
@@ -65,9 +65,7 @@ def export_documents(
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")

     ###########################################################################
@@ -152,24 +150,13 @@ def main():

     ###########################################################################

-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
-
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input)
-    success_count, failure_count = export_documents(
-        conv_results, output_dir=Path("./scratch")
-    )
+    conv_result = doc_converter.convert(input_doc_path)

     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted in {end_time:.2f} seconds.")


 if __name__ == "__main__":
@@ -2,13 +2,7 @@ import logging
 import time
 from pathlib import Path

-from docling.datamodel.base_models import (
-    ConversionStatus,
-    FigureElement,
-    InputFormat,
-    Table,
-)
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import FigureElement, InputFormat, Table
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
     # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -42,17 +32,9 @@ def main():
     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
     output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue
-
-        doc_filename = conv_res.input.file.stem
+    doc_filename = conv_res.input.file.stem

     # Export page images
@@ -66,22 +48,13 @@ def main():
     for element, image in conv_res.render_element_images(
         element_types=(FigureElement, Table)
     ):
-        element_image_filename = (
-            output_dir / f"{doc_filename}-element-{element.id}.png"
-        )
+        element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
         with element_image_filename.open("wb") as fp:
             image.save(fp, "PNG")

-        success_count += 1
-
     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":
@@ -5,8 +5,7 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus, InputFormat
-from docling.datamodel.document import DocumentConversionInput
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.utils.export import generate_multimodal_pages
@@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
     # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
@@ -41,16 +36,9 @@ def main():
     start_time = time.time()

-    converted_docs = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
     output_dir.mkdir(parents=True, exist_ok=True)
-    for doc in converted_docs:
-        if doc.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {doc.input.file} failed to convert.")
-            failure_count += 1
-            continue

     rows = []
     for (
@@ -60,14 +48,14 @@ def main():
         page_cells,
         page_segments,
         page,
-    ) in generate_multimodal_pages(doc):
+    ) in generate_multimodal_pages(conv_res):

         dpi = page._default_image_scale * 72

         rows.append(
             {
-                "document": doc.input.file.name,
-                "hash": doc.input.document_hash,
+                "document": conv_res.input.file.name,
+                "hash": conv_res.input.document_hash,
                 "page_hash": page.page_hash,
                 "image": {
                     "width": page.image.width,
@@ -87,7 +75,6 @@ def main():
                 },
             }
         )
-        success_count += 1

     # Generate one parquet from all documents
     df = pd.json_normalize(rows)
@@ -97,11 +84,8 @@ def main():
     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(
+        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
+    )

     # This block demonstrates how the file can be opened with the HF datasets library
@@ -4,8 +4,6 @@ from pathlib import Path

 import pandas as pd

-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@@ -14,27 +12,16 @@ _log = logging.getLogger(__name__)
 def main():
     logging.basicConfig(level=logging.INFO)

-    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-    ]
+    input_doc_path = Path("./tests/data/2206.01062.pdf")
     output_dir = Path("./scratch")

-    input_files = DocumentConversionInput.from_paths(input_doc_paths)
-
     doc_converter = DocumentConverter()

     start_time = time.time()

-    conv_results = doc_converter.convert_batch(input_files)
+    conv_res = doc_converter.convert(input_doc_path)

-    success_count = 0
-    failure_count = 0
     output_dir.mkdir(parents=True, exist_ok=True)
-    for conv_res in conv_results:
-        if conv_res.status != ConversionStatus.SUCCESS:
-            _log.info(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-            continue
-
-        doc_filename = conv_res.input.file.stem
+    doc_filename = conv_res.input.file.stem
@@ -50,23 +37,14 @@ def main():
         table_df.to_csv(element_csv_filename)

         # Save the table as html
-        element_html_filename = (
-            output_dir / f"{doc_filename}-table-{table_ix+1}.html"
-        )
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
         _log.info(f"Saving HTML table to {element_html_filename}")
         with element_html_filename.open("w") as fp:
             fp.write(table.export_to_html())

-    success_count += 1
-
     end_time = time.time() - start_time

-    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
-
-    if failure_count > 0:
-        raise RuntimeError(
-            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
-        )
+    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


 if __name__ == "__main__":
@@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
-result = converter.convert_single(source)
+result = converter.convert(source)
 print(result.output.export_to_markdown())  # output: ## Docling Technical Report [...]"
 # if the legacy output is needed, use this version
 # print(result.render_as_markdown_v1())  # output: ## Docling Technical Report [...]"
@@ -6,7 +6,6 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import (
     DocumentConverter,
     FormatOption,
@@ -28,7 +27,6 @@ input_paths = [
     Path("tests/data/2206.01062.pdf"),
     # Path("tests/data/2305.03393v1-pg9-img.png"),
 ]
-input = DocumentConversionInput.from_paths(input_paths)

 ## for defaults use:
 # doc_converter = DocumentConverter()
@@ -52,12 +50,12 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
     },
 )

-conv_results = doc_converter.convert_batch(input)
+conv_results = doc_converter.convert_all(input_paths)

 for res in conv_results:
     out_path = Path("./scratch")
     print(
-        f"Document {res.input.file.name} converted with status {res.status}."
+        f"Document {res.input.file.name} converted."
         f"\nSaved markdown output to: {str(out_path)}"
     )
     # print(res.experimental.export_to_markdown())
@@ -3,7 +3,7 @@ from pathlib import Path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
@@ -48,7 +48,7 @@ def test_e2e_conversions():
     for pdf_path in pdf_paths:
         print(f"converting {pdf_path}")

-        doc_result: ConversionResult = converter.convert_single(pdf_path)
+        doc_result: ConversionResult = converter.convert(pdf_path)

         verify_conversion_result_v1(
             input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1
@@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
-    PipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -90,7 +89,7 @@ def test_e2e_conversions():
     for pdf_path in pdf_paths:
         print(f"converting {pdf_path}")

-        doc_result: ConversionResult = converter.convert_single(pdf_path)
+        doc_result: ConversionResult = converter.convert(pdf_path)

         # Save conversions
         # save_output(pdf_path, doc_result, None)
@@ -5,8 +5,7 @@ import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
@@ -37,39 +36,24 @@ def converter():
     return converter


-def test_convert_single(converter: DocumentConverter):
+def test_convert_path(converter: DocumentConverter):

     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")

-    doc_result: ConversionResult = converter.convert_single(pdf_path)
+    doc_result = converter.convert(pdf_path)
     verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
     verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)


-def test_batch_path(converter: DocumentConverter):
-
-    pdf_path = get_pdf_path()
-    print(f"converting {pdf_path}")
-
-    conv_input = DocumentConversionInput.from_paths([pdf_path])
-
-    results = converter.convert_batch(conv_input)
-    for doc_result in results:
-        verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
-        verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
-
-
-def test_batch_bytes(converter: DocumentConverter):
+def test_convert_stream(converter: DocumentConverter):

     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")

     buf = BytesIO(pdf_path.open("rb").read())
-    docs = [DocumentStream(name=pdf_path.name, stream=buf)]
-    conv_input = DocumentConversionInput.from_streams(docs)
+    stream = DocumentStream(name=pdf_path.name, stream=buf)

-    results = converter.convert_batch(conv_input)
-    for doc_result in results:
+    doc_result = converter.convert(stream)
     verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
     verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
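For in-memory documents the wrapper classes are gone as well: wrap the bytes in a `DocumentStream` and hand it straight to `convert`, as the updated test does. A standalone sketch of the same flow:

```python
from io import BytesIO
from pathlib import Path

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

pdf_path = Path("tests/data/2206.01062.pdf")
buf = BytesIO(pdf_path.read_bytes())  # read the PDF into memory
stream = DocumentStream(name=pdf_path.name, stream=buf)

result = DocumentConverter().convert(stream)
print(result.status)
```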
@@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
     for converter in get_converters_with_table_options():
         print(f"converting {test_doc_path}")

-        doc_result: ConversionResult = converter.convert_single(test_doc_path)
+        doc_result: ConversionResult = converter.convert(test_doc_path)

         assert doc_result.status == ConversionStatus.SUCCESS