{ "schema_name": "DoclingDocument", "version": "1.5.0", "name": "example_09", "origin": { "mimetype": "text/html", "binary_hash": 6785336133244366107, "filename": "example_09.html" }, "furniture": { "self_ref": "#/furniture", "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified" }, "body": { "self_ref": "#/body", "children": [ { "$ref": "#/texts/0" }, { "$ref": "#/texts/1" }, { "$ref": "#/texts/6" }, { "$ref": "#/texts/8" }, { "$ref": "#/texts/10" } ], "content_layer": "body", "name": "_root_", "label": "unspecified" }, "groups": [ { "self_ref": "#/groups/0", "parent": { "$ref": "#/texts/3" }, "children": [ { "$ref": "#/texts/5" }, { "$ref": "#/pictures/1" }, { "$ref": "#/texts/7" }, { "$ref": "#/pictures/2" }, { "$ref": "#/texts/9" }, { "$ref": "#/pictures/3" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/1", "parent": { "$ref": "#/texts/11" }, "children": [ { "$ref": "#/texts/13" }, { "$ref": "#/texts/14" }, { "$ref": "#/texts/15" } ], "content_layer": "body", "name": "ordered list", "label": "list" } ], "texts": [ { "self_ref": "#/texts/0", "parent": { "$ref": "#/body" }, "children": [ { "$ref": "#/pictures/0" }, { "$ref": "#/texts/2" }, { "$ref": "#/texts/3" } ], "content_layer": "body", "label": "title", "prov": [], "orig": "Introduction to parsing HTML files with Docling", "text": "Introduction to parsing HTML files with Docling" }, { "self_ref": "#/texts/1", "parent": { "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "caption", "prov": [], "orig": "Docling", "text": "Docling" }, { "self_ref": "#/texts/2", "parent": { "$ref": "#/texts/0" }, "children": [], "content_layer": "body", "label": "text", "prov": [], "orig": "Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.", "text": "Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem." }, { "self_ref": "#/texts/3", "parent": { "$ref": "#/texts/0" }, "children": [ { "$ref": "#/texts/4" }, { "$ref": "#/groups/0" }, { "$ref": "#/texts/11" } ], "content_layer": "body", "label": "section_header", "prov": [], "orig": "Supported file formats", "text": "Supported file formats", "level": 1 }, { "self_ref": "#/texts/4", "parent": { "$ref": "#/texts/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [], "orig": "Docling supports multiple file formats..", "text": "Docling supports multiple file formats.." }, { "self_ref": "#/texts/5", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "Advanced PDF understanding", "text": "Advanced PDF understanding", "enumerated": false, "marker": "" }, { "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "caption", "prov": [], "orig": "PDF", "text": "PDF" }, { "self_ref": "#/texts/7", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "Microsoft Office DOCX", "text": "Microsoft Office DOCX", "enumerated": false, "marker": "" }, { "self_ref": "#/texts/8", "parent": { "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "caption", "prov": [], "orig": "DOCX", "text": "DOCX" }, { "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "HTML files (with optional support for images)", "text": "HTML files (with optional support for images)", "enumerated": false, "marker": "" }, { "self_ref": "#/texts/10", "parent": { "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "caption", "prov": [], "orig": "HTML", "text": "HTML" }, { "self_ref": "#/texts/11", "parent": { "$ref": "#/texts/3" }, "children": [ { "$ref": "#/texts/12" }, { "$ref": "#/groups/1" } ], "content_layer": "body", "label": "section_header", "prov": [], "orig": "Three backends for handling HTML files", "text": "Three backends for handling HTML files", "level": 2 }, { "self_ref": "#/texts/12", "parent": { "$ref": "#/texts/11" }, "children": [], "content_layer": "body", "label": "text", "prov": [], "orig": "Docling has three backends for parsing HTML files:", "text": "Docling has three backends for parsing HTML files:" }, { "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "HTMLDocumentBackend Ignores images", "text": "HTMLDocumentBackend Ignores images", "enumerated": true, "marker": "" }, { "self_ref": "#/texts/14", "parent": { "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "HTMLDocumentBackendImagesInline Extracts images inline", "text": "HTMLDocumentBackendImagesInline Extracts images inline", "enumerated": true, "marker": "" }, { "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [], "orig": "HTMLDocumentBackendImagesReferenced Extracts images as references", "text": "HTMLDocumentBackendImagesReferenced Extracts images as references", "enumerated": true, "marker": "" } ], "pictures": [ { "self_ref": "#/pictures/0", "parent": { "$ref": "#/texts/0" }, "children": [], "content_layer": "body", "label": "picture", "prov": [], "captions": [ { "$ref": "#/texts/1" } ], "references": [], "footnotes": [], "annotations": [] }, { "self_ref": "#/pictures/1", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "picture", "prov": [], "captions": [ { "$ref": "#/texts/6" } ], "references": [], "footnotes": [], "annotations": [] }, { "self_ref": "#/pictures/2", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "picture", "prov": [], "captions": [ { "$ref": "#/texts/8" } ], "references": [], "footnotes": [], "annotations": [] }, { "self_ref": "#/pictures/3", "parent": { "$ref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "picture", "prov": [], "captions": [ { "$ref": "#/texts/10" } ], "references": [], "footnotes": [], "annotations": [] } ], "tables": [], "key_value_items": [], "form_items": [], "pages": {} }