docling/docs/examples/onnxtr_with_custom_models.py
felix a19cf81f98 style & quality applied
Signed-off-by: felix <felixdittrich92@gmail.com>
2025-03-31 14:21:24 +02:00

41 lines
1.1 KiB
Python

from docling.datamodel.pipeline_options import OnnxtrOcrOptions, PdfPipelineOptions
from docling.document_converter import (
ConversionResult,
DocumentConverter,
InputFormat,
PdfFormatOption,
)
def main():
    """Convert a sample PDF to Markdown using OnnxTR OCR with custom models.

    Builds ``OnnxtrOcrOptions`` with an explicit detection architecture and
    recognition model, wraps them in ``PdfPipelineOptions``, registers that
    pipeline for ``InputFormat.PDF`` on a ``DocumentConverter``, converts the
    source URL, and prints the resulting document's Markdown export.
    """
    # Document that will be fetched and converted.
    pdf_url = "https://arxiv.org/pdf/2408.09869v4"

    pdf_pipeline = PdfPipelineOptions(
        ocr_options=OnnxtrOcrOptions(
            det_arch="db_mobilenet_v3_large",
            # Recognition model is downloaded from the Hugging Face Hub.
            reco_arch="Felix92/onnxtr-parseq-multilingual-v1",
            # Lets the OCR step correct the orientation of the pages.
            auto_correct_orientation=True,
        ),
    )

    # Route PDF inputs through the pipeline configured above.
    per_format = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline),
    }
    pdf_converter = DocumentConverter(format_options=per_format)

    result: ConversionResult = pdf_converter.convert(source=pdf_url)
    print(result.document.export_to_markdown())
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()