docling/examples/img_understand_pipeline.py
Michele Dolfi a122a7be4c introduce img understand pipeline
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2024-09-22 20:24:38 +02:00

133 lines
4.0 KiB
Python

import logging
import os
import time
from pathlib import Path
from typing import Iterable
import httpx
from dotenv import load_dotenv
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pipeline.img_understand_pipeline import (
ImgUnderstandApiOptions,
ImgUnderstandPipeline,
ImgUnderstandPipelineOptions,
ImgUnderstandVllmOptions,
)
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Tally conversion outcomes and make sure ``output_dir`` exists.

    The export calls themselves are only kept as commented-out examples
    below, so nothing is currently written into ``output_dir``.

    Args:
        conv_results: Conversion results to inspect; iterated exactly once.
        output_dir: Directory that is created up front (parents included).

    Returns:
        A ``(success_count, failure_count)`` tuple.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = failure_count = 0

    for conv_res in conv_results:
        converted = conv_res.status == ConversionStatus.SUCCESS
        if not converted:
            # Report the failing input and move on to the next result.
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        # Base name (no extension) that the export examples below build on.
        doc_filename = conv_res.input.file.stem

        # Example exports, left disabled:
        #
        # # Export Deep Search document JSON format:
        # with (output_dir / f"{doc_filename}.json").open("w") as fp:
        #     fp.write(json.dumps(conv_res.render_as_dict()))
        #
        # # Export Text format:
        # with (output_dir / f"{doc_filename}.txt").open("w") as fp:
        #     fp.write(conv_res.render_as_text())
        #
        # # Export Markdown format:
        # with (output_dir / f"{doc_filename}.md").open("w") as fp:
        #     fp.write(conv_res.render_as_markdown())
        #
        # # Export Document Tags format:
        # with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
        #     fp.write(conv_res.render_as_doctags())

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count
def _get_iam_access_token(api_key: str) -> str:
    """Exchange an IBM Cloud API key for an IAM access token.

    Args:
        api_key: API key posted to the IAM token endpoint.

    Returns:
        The ``access_token`` field of the endpoint's JSON response.

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with an error status.
        KeyError: If the JSON response has no ``access_token`` field.
    """
    res = httpx.post(
        url="https://iam.cloud.ibm.com/identity/token",
        headers={
            "Content-Type": "application/x-www-form-urlencoded",
        },
        # Passing a dict lets httpx form-encode the fields, so an API key
        # containing reserved characters ("&", "=", "+", "%") cannot corrupt
        # the request body the way manual string interpolation could.
        data={
            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
            "apikey": api_key,
        },
    )
    res.raise_for_status()
    api_out = res.json()
    # Deliberately do not print or log the response: it contains the bearer
    # token itself and would leak credentials into stdout / log files.
    return api_out["access_token"]
def main():
    """Convert the sample PDF with the image-understanding pipeline.

    Reads ``WX_API_KEY`` and ``WX_PROJECT_ID`` from the environment (after
    ``load_dotenv()`` has run), configures a ``DocumentConverter`` that uses
    ``ImgUnderstandPipeline`` with API-based options, converts
    ``./tests/data/2206.01062.pdf`` and logs the elapsed time.

    Raises:
        RuntimeError: If a required environment variable is missing or
            empty, or if at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]

    load_dotenv()
    api_key = os.environ.get("WX_API_KEY")
    project_id = os.environ.get("WX_PROJECT_ID")

    # Fail early with a clear message instead of sending "None" as the
    # credential and surfacing an opaque HTTP error from the token request.
    missing = [
        name
        for name, value in (("WX_API_KEY", api_key), ("WX_PROJECT_ID", project_id))
        if not value
    ]
    if missing:
        raise RuntimeError(
            f"Missing required environment variable(s): {', '.join(missing)}."
        )

    doc_converter = DocumentConverter(
        pipeline_cls=ImgUnderstandPipeline,
        # TODO: make DocumentConverter provide the correct default value
        # for pipeline_options, given the pipeline_cls
        pipeline_options=ImgUnderstandPipelineOptions(
            img_understand_options=ImgUnderstandApiOptions(
                url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
                headers={
                    "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
                },
                params=dict(
                    model_id="meta-llama/llama3-llava-next-8b-hf",
                    project_id=project_id,
                    max_tokens=512,
                    seed=42,
                ),
                llm_prompt="Describe this figure in three sentences.",
                provenance="llama3-llava-next-8b-hf",
            )
        ),
    )

    # Define input files (named so the builtin ``input`` is not shadowed).
    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)
    success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    # Measured after export_documents has consumed the conversion results.
    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()