docling/examples/img_understand_pipeline.py

import logging
import os
import time
from pathlib import Path
from typing import Iterable

import httpx
from dotenv import load_dotenv

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pipeline.img_understand_pipeline import (
    ImgUnderstandApiOptions,
    ImgUnderstandPipeline,
    ImgUnderstandPipelineOptions,
    ImgUnderstandVllmOptions,
)

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # # Export Deep Search document JSON format:
            # with (output_dir / f"{doc_filename}.json").open("w") as fp:
            #     fp.write(json.dumps(conv_res.render_as_dict()))

            # # Export Text format:
            # with (output_dir / f"{doc_filename}.txt").open("w") as fp:
            #     fp.write(conv_res.render_as_text())

            # # Export Markdown format:
            # with (output_dir / f"{doc_filename}.md").open("w") as fp:
            #     fp.write(conv_res.render_as_markdown())

            # # Export Document Tags format:
            # with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
            #     fp.write(conv_res.render_as_doctags())

        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count


def _get_iam_access_token(api_key: str) -> str:
    res = httpx.post(
        url="https://iam.cloud.ibm.com/identity/token",
        headers={
            "Content-Type": "application/x-www-form-urlencoded",
        },
        data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
    )
    res.raise_for_status()
    api_out = res.json()
    print(f"{api_out=}")
    return api_out["access_token"]


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]

    load_dotenv()
    api_key = os.environ.get("WX_API_KEY")
    project_id = os.environ.get("WX_PROJECT_ID")

    doc_converter = DocumentConverter(
        pipeline_cls=ImgUnderstandPipeline,
        # TODO: make DocumentConverter provide the correct default value
        # for pipeline_options, given the pipeline_cls
        pipeline_options=ImgUnderstandPipelineOptions(
            img_understand_options=ImgUnderstandApiOptions(
                url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
                headers={
                    "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
                },
                params=dict(
                    model_id="meta-llama/llama3-llava-next-8b-hf",
                    project_id=project_id,
                    max_tokens=512,
                    seed=42,
                ),
                llm_prompt="Describe this figure in three sentences.",
                provenance="llama3-llava-next-8b-hf",
            )
        ),
    )

    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input)
    success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()