{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Document Loading using Docling & Adding Table and Figure Descriptions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### This notebook generates enrichments (descriptions) for tables and images and places them back in the markdown, creating more meaningful search indexes and reducing false positives in semantic searches over tables and images"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.\n",
"The environment variables are loaded from the `.env` file in the same directory as this notebook.\n",
"\"\"\"\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from openai import AzureOpenAI\n",
"\n",
"load_dotenv()\n",
"\n",
"aoai_api_base = os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
"aoai_api_key= os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
"aoai_deployment_name = os.getenv(\"AZURE_OPENAI_DEPLOYMENT_NAME\")\n",
"aoai_api_version = os.getenv(\"AZURE_OPENAI_API_VERSION\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"from mimetypes import guess_type\n",
"\n",
"# Function to encode a local image into data URL \n",
"def local_image_to_data_url(image_path):\n",
" # Guess the MIME type of the image based on the file extension\n",
" mime_type, _ = guess_type(image_path)\n",
" if mime_type is None:\n",
" mime_type = 'application/octet-stream' # Default MIME type if none is found\n",
"\n",
" # Read and encode the image file\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')\n",
"\n",
" # Construct the data URL\n",
" return f\"data:{mime_type};base64,{base64_encoded_data}\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Use Azure OpenAI (GPT-4V model) to understand the semantics of the figure content"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"MAX_TOKENS = 2000\n",
"\n",
"def understand_image_with_gptv(api_base, api_key, deployment_name, api_version, image_path, prompt=\"Describe this image in English:\"):\n",
"    \"\"\"\n",
"    Generates a description for an image using a GPT-4V (vision) deployment.\n",
"\n",
"    Parameters:\n",
"    - api_base (str): The base URL of the Azure OpenAI resource.\n",
"    - api_key (str): The API key for authentication.\n",
"    - deployment_name (str): The name of the model deployment.\n",
"    - api_version (str): The version of the API.\n",
"    - image_path (str): The path to the image file.\n",
"    - prompt (str): Instruction sent alongside the image. Defaults to the\n",
"      original hard-coded request for a plain English description, so\n",
"      existing callers are unaffected.\n",
"\n",
"    Returns:\n",
"    - img_description (str): The generated description for the image.\n",
"    \"\"\"\n",
"    client = AzureOpenAI(\n",
"        api_key=api_key,\n",
"        api_version=api_version,\n",
"        base_url=f\"{api_base}/openai/deployments/{deployment_name}\"\n",
"    )\n",
"\n",
"    # Inline the image as a base64 data URL so no separate upload is needed.\n",
"    data_url = local_image_to_data_url(image_path)\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=deployment_name,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"            {\"role\": \"user\", \"content\": [\n",
"                {\"type\": \"text\", \"text\": prompt},\n",
"                {\"type\": \"image_url\", \"image_url\": {\"url\": data_url}}\n",
"            ]}\n",
"        ],\n",
"        max_tokens=MAX_TOKENS\n",
"    )\n",
"\n",
"    # First (and only) choice carries the model's answer.\n",
"    img_description = response.choices[0].message.content\n",
"\n",
"    return img_description"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Use Azure OpenAI (GPT-4V model) to understand the semantics of the table content (Abstractions of the table content and images)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"MAX_TOKENS = 2000\n",
"\n",
"def understand_table_with_gptv(api_base, api_key, deployment_name, api_version, image_path, prompt=\"Describe this Table image in English:\"):\n",
"    \"\"\"\n",
"    Generates a description for a table image using a GPT-4V (vision) deployment.\n",
"\n",
"    Parameters:\n",
"    - api_base (str): The base URL of the Azure OpenAI resource.\n",
"    - api_key (str): The API key for authentication.\n",
"    - deployment_name (str): The name of the model deployment.\n",
"    - api_version (str): The version of the API.\n",
"    - image_path (str): The path to the rendered table image file.\n",
"    - prompt (str): Instruction sent alongside the image. Defaults to the\n",
"      original hard-coded request for a table description, so existing\n",
"      callers are unaffected.\n",
"\n",
"    Returns:\n",
"    - img_description (str): The generated description for the table image.\n",
"    \"\"\"\n",
"    client = AzureOpenAI(\n",
"        api_key=api_key,\n",
"        api_version=api_version,\n",
"        base_url=f\"{api_base}/openai/deployments/{deployment_name}\"\n",
"    )\n",
"\n",
"    # Inline the image as a base64 data URL so no separate upload is needed.\n",
"    data_url = local_image_to_data_url(image_path)\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=deployment_name,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"            {\"role\": \"user\", \"content\": [\n",
"                {\"type\": \"text\", \"text\": prompt},\n",
"                {\"type\": \"image_url\", \"image_url\": {\"url\": data_url}}\n",
"            ]}\n",
"        ],\n",
"        max_tokens=MAX_TOKENS\n",
"    )\n",
"\n",
"    # First (and only) choice carries the model's answer.\n",
"    img_description = response.choices[0].message.content\n",
"\n",
"    return img_description"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Update the markdown with the abstractions of images as generated earlier, updates are being done exactly above each respective image in the markdown"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"def update_figure_description(md_content, img_description, idx):\n",
" \"\"\"\n",
" Updates the figure description in the Markdown content.\n",
"\n",
" Args:\n",
" md_content (str): The original Markdown content.\n",
" img_description (str): The new description for the image.\n",
" idx (int): The index of the figure.\n",
"\n",
" Returns:\n",
" str: The updated Markdown content with the new figure description.\n",
" \"\"\"\n",
"\n",
" # The substring you're looking for\n",
" start_substring = f\"\"\n",
" end_substring = \"\"\n",
" new_string = f\"\"\n",
" \n",
" new_md_content = md_content\n",
" # Find the start and end indices of the part to replace\n",
" start_index = md_content.find(start_substring)\n",
" if start_index != -1: # if start_substring is found\n",
" start_index += len(start_substring) # move the index to the end of start_substring\n",
" end_index = md_content.find(end_substring, start_index)\n",
" if end_index != -1: # if end_substring is found\n",
" # Replace the old string with the new string\n",
" new_md_content = md_content[:start_index] + new_string + md_content[end_index:]\n",
" \n",
" return new_md_content\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Update the markdown with the abstractions of tables as generated earlier, updates are being done exactly above each respective table in the markdown"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"def update_table_description(md_content, img_description, idx):\n",
" \"\"\"\n",
" Updates the figure description in the Markdown content.\n",
"\n",
" Args:\n",
" md_content (str): The original Markdown content.\n",
" img_description (str): The new description for the image.\n",
" idx (int): The index of the figure.\n",
"\n",
" Returns:\n",
" str: The updated Markdown content with the new figure description.\n",
" \"\"\"\n",
"\n",
" # The substring you're looking for\n",
" start_substring = f\"\"\n",
" new_string = f\"\"\n",
" \n",
" new_md_content = md_content\n",
" new_md_content = new_md_content.replace(start_substring,f'{start_substring}\\n{new_string}')\n",
" \n",
" return new_md_content\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Tagging figures and tables with their order, which makes traceability and explainability easier"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def add_figures_indexes(string):\n",
" figures = re.finditer(r\"(.*?)\", string, re.DOTALL)\n",
" updated_string = string\n",
" offset = 0\n",
" for index, match in enumerate(figures):\n",
" full_match = match.group(1) # Full ...\n",
" replacement = f\"\\n\\n\\n\\n\"\n",
" start, end = match.start() + offset, match.end() + offset\n",
" updated_string = updated_string[:start] + replacement + updated_string[end:]\n",
" offset += len(replacement) - len(full_match)\n",
"\n",
" return updated_string\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def add_tables_indexes(content):\n",
" # Match markdown tables using a pattern that looks for at least two pipe-separated rows\n",
" # - one for headers (with optional alignment)\n",
" # - one or more for data\n",
" table_pattern = re.compile(\n",
" r\"(?P\\n\\n|^)(?P(?:\\|.*\\|\\n)+\\|(?: *[-:]+[-| :]*?)\\|\\n(?:\\|.*\\|\\n?)+)\",\n",
" re.MULTILINE\n",
" )\n",
"\n",
" updated_content = content\n",
" offset = 0\n",
"\n",
" for index, match in enumerate(table_pattern.finditer(content)):\n",
" start, end = match.start(\"table\") + offset, match.end(\"table\") + offset\n",
" replacement = f\"\\n\" + match.group(\"table\")\n",
" updated_content = updated_content[:start] + replacement + updated_content[end:]\n",
" offset += len(replacement) - len(match.group(\"table\"))\n",
"\n",
" return updated_content\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### docling reader - calls the above functions and generates the markdowns"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from docling_core.types.doc import ImageRefMode, PictureItem, TableItem\n",
"from docling.datamodel.base_models import FigureElement, InputFormat, Table\n",
"from docling.datamodel.pipeline_options import PdfPipelineOptions\n",
"from docling.document_converter import DocumentConverter, PdfFormatOption\n",
"IMAGE_RESOLUTION_SCALE = 2.0\n",
"\n",
"def docling_reader(input_file_path):\n",
" input_file_path = Path(f\"{input_file_path}\")\n",
" \n",
"\n",
" pipeline_options = PdfPipelineOptions()\n",
" pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE\n",
" pipeline_options.generate_page_images = True\n",
" pipeline_options.generate_table_images = True\n",
" pipeline_options.generate_picture_images = True\n",
"\n",
" doc_converter = DocumentConverter(\n",
" format_options={\n",
" InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)\n",
" }\n",
" )\n",
"\n",
" conv_res = doc_converter.convert(input_file_path)\n",
" md_content = conv_res.document.export_to_markdown(image_placeholder='',page_break_placeholder='')\n",
" \n",
" doc_filename = conv_res.input.file.stem\n",
" output_dir = Path(f\"Output_Markdowns/{doc_filename}\")\n",
" output_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
" md_filename = output_dir / f\"md_content_{doc_filename}.md\"\n",
" with md_filename.open(\"w\", encoding=\"utf-8\") as fp:\n",
" fp.write(md_content)\n",
"\n",
" md_content = add_figures_indexes(md_content)\n",
" md_content = add_tables_indexes(md_content)\n",
"\n",
" # Save page images\n",
" for page_no, page in conv_res.document.pages.items():\n",
" page_no = page.page_no\n",
" page_image_filename = output_dir / f\"{doc_filename}-{page_no}.png\"\n",
" with page_image_filename.open(\"wb\") as fp:\n",
" page.image.pil_image.save(fp, format=\"PNG\")\n",
"\n",
" # Save images of figures and tables\n",
" table_counter = 0\n",
" picture_counter = 0\n",
" for element, _level in conv_res.document.iterate_items(): \n",
" if isinstance(element, TableItem):\n",
" element_image_filename = (\n",
" output_dir / f\"{doc_filename}-table-{table_counter}.png\"\n",
" )\n",
" with element_image_filename.open(\"wb\") as fp:\n",
" element.image.pil_image.save(fp, \"PNG\")\n",
" table_desc = understand_table_with_gptv(aoai_api_base, aoai_api_key, aoai_deployment_name, aoai_api_version, element_image_filename)\n",
" md_content = update_table_description(md_content, table_desc, table_counter)\n",
" table_counter += 1\n",
" \n",
" if isinstance(element, PictureItem):\n",
" element_image_filename = (\n",
" output_dir / f\"{doc_filename}-picture-{picture_counter}.png\"\n",
" )\n",
" with element_image_filename.open(\"wb\") as fp:\n",
" element.image.pil_image.save(fp, \"PNG\")\n",
" picture_desc = understand_image_with_gptv(aoai_api_base, aoai_api_key, aoai_deployment_name, aoai_api_version, element_image_filename)\n",
" md_content = update_figure_description(md_content, picture_desc, picture_counter)\n",
" picture_counter += 1\n",
" \n",
"\n",
" # Save markdown with embedded pictures\n",
" md_filename = output_dir / f\"md_content_tf_description_{doc_filename}.md\"\n",
" with md_filename.open(\"w\", encoding=\"utf-8\") as fp:\n",
" fp.write(md_content)\n",
" return md_content"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Completed process for Datasheet_AR71UW2SW.pdf\n",
"Completed process for Datasheet_EOE8P39X.pdf\n",
"Completed process for Datasheet_ESS47400SX.pdf\n",
"Completed process for Datasheet_EWUS052B5B.pdf\n"
]
}
],
"source": [
"# Entry point: run the Docling + GPT-4V enrichment pipeline on each datasheet.\n",
"if __name__ == \"__main__\":\n",
"    input_file_paths = (\n",
"        \"Datasheet_AR71UW2SW.pdf\",\n",
"        \"Datasheet_EOE8P39X.pdf\",\n",
"        \"Datasheet_ESS47400SX.pdf\",\n",
"        \"Datasheet_EWUS052B5B.pdf\",\n",
"    )\n",
"    for pdf_name in input_file_paths:\n",
"        docling_reader(pdf_name)\n",
"        print(f'Completed process for {pdf_name}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}