diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e31b769..3561c135 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09 + +### Feature + +* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170)) + +### Fix + +* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2)) +* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96)) + ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09 ### Feature diff --git a/docling/cli/main.py b/docling/cli/main.py index 07b37b26..d20c6332 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, + OcrEngine, OcrMacOptions, OcrOptions, + PdfBackend, PdfPipelineOptions, RapidOcrOptions, TableFormerMode, @@ -70,22 +72,6 @@ def version_callback(value: bool): raise typer.Exit() -# Define an enum for the backend options -class PdfBackend(str, Enum): - PYPDFIUM2 = "pypdfium2" - DLPARSE_V1 = "dlparse_v1" - DLPARSE_V2 = "dlparse_v2" - - -# Define an enum for the ocr engines -class OcrEngine(str, Enum): - EASYOCR = "easyocr" - TESSERACT_CLI = "tesseract_cli" - TESSERACT = "tesseract" - OCRMAC = "ocrmac" - RAPIDOCR = "rapidocr" - - def export_documents( conv_results: Iterable[ConversionResult], output_dir: Path, diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 72c55182..e6b3607f 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -19,12 +19,12 @@ if TYPE_CHECKING: class ConversionStatus(str, Enum): - PENDING = auto() - STARTED = auto() - FAILURE = auto() - SUCCESS = auto() - PARTIAL_SUCCESS = auto() - SKIPPED = auto() + PENDING = "pending" + STARTED = "started" + FAILURE = "failure" + SUCCESS = "success" + PARTIAL_SUCCESS = "partial_success" + SKIPPED = "skipped" class InputFormat(str, Enum): @@ -89,15 +89,15 @@ MimeTypeToFormat = { class DocInputType(str, Enum): - PATH = auto() - STREAM = auto() + PATH = "path" + STREAM = "stream" class DoclingComponentType(str, Enum): - DOCUMENT_BACKEND = auto() - MODEL = auto() - DOC_ASSEMBLER = auto() - USER_INPUT = auto() + DOCUMENT_BACKEND = "document_backend" + MODEL = "model" + DOC_ASSEMBLER = "doc_assembler" + USER_INPUT = "user_input" class ErrorItem(BaseModel): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 242052f7..bae4ab2a 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import ( from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.utils.file import resolve_source_to_stream +from docling_core.utils.legacy import docling_document_to_legacy from pydantic import BaseModel from typing_extensions import deprecated @@ -191,259 +192,7 @@ class ConversionResult(BaseModel): @property @deprecated("Use document instead.") def legacy_document(self): - reverse_label_mapping = { - DocItemLabel.CAPTION.value: "Caption", - DocItemLabel.FOOTNOTE.value: "Footnote", - DocItemLabel.FORMULA.value: "Formula", - DocItemLabel.LIST_ITEM.value: "List-item", - DocItemLabel.PAGE_FOOTER.value: "Page-footer", - DocItemLabel.PAGE_HEADER.value: "Page-header", - DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples. - DocItemLabel.SECTION_HEADER.value: "Section-header", - DocItemLabel.TABLE.value: "Table", - DocItemLabel.TEXT.value: "Text", - DocItemLabel.TITLE.value: "Title", - DocItemLabel.DOCUMENT_INDEX.value: "Document Index", - DocItemLabel.CODE.value: "Code", - DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected", - DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected", - DocItemLabel.FORM.value: "Form", - DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region", - DocItemLabel.PARAGRAPH.value: "paragraph", - } - - title = "" - desc = DsDocumentDescription(logs=[]) - - page_hashes = [ - PageReference( - hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)), - page=p.page_no, - model="default", - ) - for p in self.document.pages.values() - ] - - file_info = DsFileInfoObject( - filename=self.input.file.name, - document_hash=self.input.document_hash, - num_pages=self.input.page_count, - page_hashes=page_hashes, - ) - - main_text = [] - tables = [] - figures = [] - equations = [] - footnotes = [] - page_headers = [] - page_footers = [] - - embedded_captions = set() - for ix, (item, level) in enumerate( - self.document.iterate_items(self.document.body) - ): - - if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0: - caption = item.caption_text(self.document) - if caption: - embedded_captions.add(caption) - - for item, level in self.document.iterate_items(): - if isinstance(item, DocItem): - item_type = item.label - - if isinstance(item, (TextItem, ListItem, SectionHeaderItem)): - - if isinstance(item, ListItem) and item.marker: - text = f"{item.marker} {item.text}" - else: - text = item.text - - # Can be empty. - prov = [ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, len(item.text)], - ) - for p in item.prov - ] - main_text.append( - BaseText( - text=text, - obj_type=layout_label_to_ds_type.get(item.label), - name=reverse_label_mapping[item.label], - prov=prov, - ) - ) - - # skip captions of they are embedded in the actual - # floating object - if item_type == DocItemLabel.CAPTION and text in embedded_captions: - continue - - elif isinstance(item, TableItem) and item.data: - index = len(tables) - ref_str = f"#/tables/{index}" - main_text.append( - Ref( - name=reverse_label_mapping[item.label], - obj_type=layout_label_to_ds_type.get(item.label), - ref=ref_str, - ), - ) - - # Initialise empty table data grid (only empty cells) - table_data = [ - [ - TableCell( - text="", - # bbox=[0,0,0,0], - spans=[[i, j]], - obj_type="body", - ) - for j in range(item.data.num_cols) - ] - for i in range(item.data.num_rows) - ] - - # Overwrite cells in table data for which there is actual cell content. - for cell in item.data.table_cells: - for i in range( - min(cell.start_row_offset_idx, item.data.num_rows), - min(cell.end_row_offset_idx, item.data.num_rows), - ): - for j in range( - min(cell.start_col_offset_idx, item.data.num_cols), - min(cell.end_col_offset_idx, item.data.num_cols), - ): - celltype = "body" - if cell.column_header: - celltype = "col_header" - elif cell.row_header: - celltype = "row_header" - elif cell.row_section: - celltype = "row_section" - - def make_spans(cell): - for rspan in range( - min( - cell.start_row_offset_idx, - item.data.num_rows, - ), - min( - cell.end_row_offset_idx, item.data.num_rows - ), - ): - for cspan in range( - min( - cell.start_col_offset_idx, - item.data.num_cols, - ), - min( - cell.end_col_offset_idx, - item.data.num_cols, - ), - ): - yield [rspan, cspan] - - spans = list(make_spans(cell)) - table_data[i][j] = GlmTableCell( - text=cell.text, - bbox=( - cell.bbox.as_tuple() - if cell.bbox is not None - else None - ), # check if this is bottom-left - spans=spans, - obj_type=celltype, - col=j, - row=i, - row_header=cell.row_header, - row_section=cell.row_section, - col_header=cell.column_header, - row_span=[ - cell.start_row_offset_idx, - cell.end_row_offset_idx, - ], - col_span=[ - cell.start_col_offset_idx, - cell.end_col_offset_idx, - ], - ) - - # Compute the caption - caption = item.caption_text(self.document) - - tables.append( - DsSchemaTable( - text=caption, - num_cols=item.data.num_cols, - num_rows=item.data.num_rows, - obj_type=layout_label_to_ds_type.get(item.label), - data=table_data, - prov=[ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, 0], - ) - for p in item.prov - ], - ) - ) - - elif isinstance(item, PictureItem): - index = len(figures) - ref_str = f"#/figures/{index}" - main_text.append( - Ref( - name=reverse_label_mapping[item.label], - obj_type=layout_label_to_ds_type.get(item.label), - ref=ref_str, - ), - ) - - # Compute the caption - caption = item.caption_text(self.document) - - figures.append( - Figure( - prov=[ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, len(caption)], - ) - for p in item.prov - ], - obj_type=layout_label_to_ds_type.get(item.label), - text=caption, - # data=[[]], - ) - ) - - page_dimensions = [ - PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width) - for p in self.document.pages.values() - ] - - ds_doc = DsDocument( - name=title, - description=desc, - file_info=file_info, - main_text=main_text, - equations=equations, - footnotes=footnotes, - page_headers=page_headers, - page_footers=page_footers, - tables=tables, - figures=figures, - page_dimensions=page_dimensions, - ) - - return ds_doc + return docling_document_to_legacy(self.document) class _DummyBackend(AbstractDocumentBackend): diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 2aeab531..52af10c4 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -190,6 +190,26 @@ class OcrMacOptions(OcrOptions): ) +# Define an enum for the backend options +class PdfBackend(str, Enum): + """Enum of valid PDF backends.""" + + PYPDFIUM2 = "pypdfium2" + DLPARSE_V1 = "dlparse_v1" + DLPARSE_V2 = "dlparse_v2" + + +# Define an enum for the ocr engines +class OcrEngine(str, Enum): + """Enum of valid OCR engines.""" + + EASYOCR = "easyocr" + TESSERACT_CLI = "tesseract_cli" + TESSERACT = "tesseract" + OCRMAC = "ocrmac" + RAPIDOCR = "rapidocr" + + class PipelineOptions(BaseModel): """Base pipeline options.""" diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py index 7cf8c7d3..cba2a8a0 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docling/models/rapid_ocr_model.py @@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel): del high_res_image del im - cells = [ - OcrCell( - id=ix, - text=line[1], - confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, + if result is not None: + cells = [ + OcrCell( + id=ix, + text=line[1], + confidence=line[2], + bbox=BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, ), - origin=CoordOrigin.TOPLEFT, - ), - ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) # Post-process the cells page.cells = self.post_process_cells(all_ocr_cells, page.cells) diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md index 07aa1b30..00e81db0 100644 --- a/docs/concepts/architecture.md +++ b/docs/concepts/architecture.md @@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific * The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation. -Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*. +Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md). For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869). diff --git a/docs/cli.md b/docs/reference/cli.md similarity index 82% rename from docs/cli.md rename to docs/reference/cli.md index 3f67df0d..25612267 100644 --- a/docs/cli.md +++ b/docs/reference/cli.md @@ -1,4 +1,4 @@ -# CLI Reference +# CLI reference This page provides documentation for our command line tools. @@ -6,4 +6,4 @@ This page provides documentation for our command line tools. :module: docling.cli.main :command: click_app :prog_name: docling - :style: table \ No newline at end of file + :style: table diff --git a/docs/api_reference/docling_document.md b/docs/reference/docling_document.md similarity index 100% rename from docs/api_reference/docling_document.md rename to docs/reference/docling_document.md diff --git a/docs/api_reference/document_converter.md b/docs/reference/document_converter.md similarity index 100% rename from docs/api_reference/document_converter.md rename to docs/reference/document_converter.md diff --git a/docs/api_reference/pipeline_options.md b/docs/reference/pipeline_options.md similarity index 100% rename from docs/api_reference/pipeline_options.md rename to docs/reference/pipeline_options.md diff --git a/docs/usage.md b/docs/usage.md index e7a214a0..9a5b555a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,9 +22,7 @@ A simple example would look like this: docling https://arxiv.org/pdf/2206.01062 ``` -To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md). - - +To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md). ### Advanced options @@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl ## Chunking -You can perform a hierarchy-aware chunking of a Docling document as follows: +You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a +`HybridChunker`, as shown below (for more details check out +[this example](examples/hybrid_chunking.ipynb)): ```python from docling.document_converter import DocumentConverter -from docling_core.transforms.chunker import HierarchicalChunker +from docling.chunking import HybridChunker conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062") doc = conv_res.document -chunks = list(HierarchicalChunker().chunk(doc)) -print(chunks[30]) +chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5") # set tokenizer as needed +chunk_iter = chunker.chunk(doc) +``` + +An example chunk would look like this: + +```python +print(list(chunk_iter)[11]) # { -# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]", +# "text": "In this paper, we present the DocLayNet dataset. [...]", # "meta": { # "doc_items": [{ -# "self_ref": "#/texts/40", +# "self_ref": "#/texts/28", # "label": "text", # "prov": [{ # "page_no": 2, -# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...}, -# }] -# }], -# "headings": ["2 RELATED WORK"], +# "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...}, +# }], ..., +# }, ...], +# "headings": ["1 INTRODUCTION"], # } # } ``` diff --git a/mkdocs.yml b/mkdocs.yml index 4db4bc4f..1dd8c345 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -56,7 +56,6 @@ nav: - "Docling": index.md - Installation: installation.md - Usage: usage.md - - CLI: cli.md - FAQ: faq.md - Docling v2: v2.md - Concepts: @@ -77,15 +76,12 @@ nav: - "Multimodal export": examples/export_multimodal.py - "Force full page OCR": examples/full_page_ocr.py - "Accelerator options": examples/run_with_acclerators.py + - Chunking: + - "Hybrid chunking": examples/hybrid_chunking.ipynb - RAG / QA: - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb - - Chunking: - - "Hybrid chunking": examples/hybrid_chunking.ipynb - # - Chunking: examples/chunking.md - # - CLI: - # - CLI: examples/cli.md - Integrations: - Integrations: integrations/index.md - "🐝 Bee": integrations/bee.md @@ -100,10 +96,13 @@ nav: - "spaCy": integrations/spacy.md - "txtai": integrations/txtai.md # - "LangChain 🦜🔗": integrations/langchain.md - - API reference: - - Document Converter: api_reference/document_converter.md - - Pipeline options: api_reference/pipeline_options.md - - Docling Document: api_reference/docling_document.md + - Reference: + - Python API: + - Document Converter: reference/document_converter.md + - Pipeline options: reference/pipeline_options.md + - Docling Document: reference/docling_document.md + - CLI: + - CLI reference: reference/cli.md markdown_extensions: - pymdownx.superfences diff --git a/pyproject.toml b/pyproject.toml index fb9fac32..12c5f530 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.9.0" # DO NOT EDIT, updated automatically +version = "2.10.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT"