diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index e04e2803..b2a295dc 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -1,6 +1,10 @@ on: workflow_call: +env: + HF_HUB_DOWNLOAD_TIMEOUT: "60" + HF_HUB_ETAG_TIMEOUT: "60" + jobs: run-checks: runs-on: ubuntu-latest @@ -14,6 +18,11 @@ jobs: - name: Set TESSDATA_PREFIX run: | echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" + - name: Cache Hugging Face models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: huggingface-cache-py${{ matrix.python-version }} - uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} @@ -28,7 +37,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then echo "Skipping $file" continue fi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0fc3ac7a..dd976ea3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,10 +14,6 @@ jobs: - uses: ./.github/actions/setup-poetry - name: Build docs run: poetry run mkdocs build --verbose --clean - - name: Make docs LLM ready - if: inputs.deploy - uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b - name: Build and push docs if: inputs.deploy - run: poetry run mkdocs gh-deploy --force --dirty - + run: poetry run mkdocs gh-deploy --force diff --git a/CHANGELOG.md b/CHANGELOG.md index 597dde6e..917b3be0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,134 @@ +## [v2.25.1](https://github.com/DS4SD/docling/releases/tag/v2.25.1) - 2025-03-03 + +### Fix + +* Enable locks for threadsafe pdfium ([#1052](https://github.com/DS4SD/docling/issues/1052)) ([`8dc0562`](https://github.com/DS4SD/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad)) +* **html:** Use 'start' attribute when parsing ordered lists from HTML docs ([#1062](https://github.com/DS4SD/docling/issues/1062)) ([`de7b963`](https://github.com/DS4SD/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408)) + +### Documentation + +* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/DS4SD/docling/issues/1077)) ([`db3ceef`](https://github.com/DS4SD/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a)) + +## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26 + +### Feature + +* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849)) +* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b)) + +### Fix + +* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8)) +* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc)) + +### Documentation + +* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d)) + +## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20 + +### Feature + +* Implement new reading-order model ([#916](https://github.com/DS4SD/docling/issues/916)) ([`c93e369`](https://github.com/DS4SD/docling/commit/c93e36988f1e1e461477223143c2c1fb2162d11f)) + +## [v2.23.1](https://github.com/DS4SD/docling/releases/tag/v2.23.1) - 2025-02-20 + +### Fix + +* Runtime error when Pandas Series is not always of string type ([#1024](https://github.com/DS4SD/docling/issues/1024)) ([`6796f0a`](https://github.com/DS4SD/docling/commit/6796f0a13263281cd48712b3c71579bfd81bb0d1)) + +### Documentation + +* Revamp picture description example ([#1015](https://github.com/DS4SD/docling/issues/1015)) ([`27c0400`](https://github.com/DS4SD/docling/commit/27c04007bc1be7a6f6c90aaf04ea9f4ff8eb1f3d)) + +## [v2.23.0](https://github.com/DS4SD/docling/releases/tag/v2.23.0) - 2025-02-17 + +### Feature + +* Support cuda:n GPU device allocation ([#694](https://github.com/DS4SD/docling/issues/694)) ([`77eb77b`](https://github.com/DS4SD/docling/commit/77eb77bdc2c07b632a1d171826d1855a5218399e)) +* **xml-jats:** Parse XML JATS documents ([#967](https://github.com/DS4SD/docling/issues/967)) ([`428b656`](https://github.com/DS4SD/docling/commit/428b656793cb75d108c69f20c254be7c198cee5c)) + +### Fix + +* Revise DocTags, fix iterate_items to output content_layer in items ([#965](https://github.com/DS4SD/docling/issues/965)) ([`6e75f0b`](https://github.com/DS4SD/docling/commit/6e75f0b5d3ee42738a80049d4cf2fa6d34e8ab97)) + +## [v2.22.0](https://github.com/DS4SD/docling/releases/tag/v2.22.0) - 2025-02-14 + +### Feature + +* Add support for CSV input with new backend to transform CSV files to DoclingDocument ([#945](https://github.com/DS4SD/docling/issues/945)) ([`00d9405`](https://github.com/DS4SD/docling/commit/00d9405b0ac519d321ae54e8150f5facbaabbe14)) +* Introduce the enable_remote_services option to allow remote connections while processing ([#941](https://github.com/DS4SD/docling/issues/941)) ([`2716c7d`](https://github.com/DS4SD/docling/commit/2716c7d4ffb836664178178d3f8d01b7f9112595)) +* Allow artifacts_path to be defined as ENV ([#940](https://github.com/DS4SD/docling/issues/940)) ([`5101e25`](https://github.com/DS4SD/docling/commit/5101e2519e7a5bb727531b1412b1131a7cfbda52)) + +### Fix + +* Update Pillow constraints ([#958](https://github.com/DS4SD/docling/issues/958)) ([`af19c03`](https://github.com/DS4SD/docling/commit/af19c03f6e5e0b24e12d6a3baac6c46a4c8b10d1)) +* Fix the initialization of the TesseractOcrModel ([#935](https://github.com/DS4SD/docling/issues/935)) ([`c47ae70`](https://github.com/DS4SD/docling/commit/c47ae700ece2ea4efee17f82e4667c1ce9a0ed2a)) + +### Documentation + +* Update example Dockerfile with download CLI ([#929](https://github.com/DS4SD/docling/issues/929)) ([`7493d5b`](https://github.com/DS4SD/docling/commit/7493d5b01f8be60294afeffdfb54a62bb74bcc92)) +* Examples for picture descriptions ([#951](https://github.com/DS4SD/docling/issues/951)) ([`2d66e99`](https://github.com/DS4SD/docling/commit/2d66e99b69f39a282109c366fae3679f41c6e081)) + +## [v2.21.0](https://github.com/DS4SD/docling/releases/tag/v2.21.0) - 2025-02-10 + +### Feature + +* Add content_layer property to items to address body, furniture and other roles ([#735](https://github.com/DS4SD/docling/issues/735)) ([`cf78d5b`](https://github.com/DS4SD/docling/commit/cf78d5b7b9f12728270e673857fd299efc01a7db)) + +## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07 + +### Feature + +* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a)) + +### Fix + +* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) ([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1)) + +## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07 + +### Feature + +* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1)) + +### Fix + +* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) ([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3)) +* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79)) +* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812)) +* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e)) + +### Documentation + +* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d)) + +## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03 + +### Feature + +* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) +* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) +* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) +* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) + +### Fix + +* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) +* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) +* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) +* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) +* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) +* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) +* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) +* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) +* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) + +### Documentation + +* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) +* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) + ## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 ### Feature diff --git a/Dockerfile b/Dockerfile index c863f1c2..d210b5ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,8 +16,7 @@ ENV TORCH_HOME=/tmp/ COPY docs/examples/minimal.py /root/minimal.py -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' +RUN docling-tools models download # On container environments, always set a thread budget to avoid undesired thread congestion. ENV OMP_NUM_THREADS=4 @@ -25,3 +24,6 @@ ENV OMP_NUM_THREADS=4 # On container shell: # > cd /root/ # > python minimal.py + +# Running as `docker run -e DOCLING_ARTIFACTS_PATH=/root/.cache/docling/models` will use the +# model weights included in the container image. diff --git a/README.md b/README.md index 5a957d60..842253e9 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig Docling has been brought to you by IBM. -[supported_formats]: https://ds4sd.github.io/docling/supported_formats/ +[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/ [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ [integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py new file mode 100644 index 00000000..9097acf8 --- /dev/null +++ b/docling/backend/csv_backend.py @@ -0,0 +1,125 @@ +import csv +import logging +import warnings +from io import BytesIO, StringIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +class CsvDocumentBackend(DeclarativeDocumentBackend): + content: StringIO + + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + # Load content + try: + if isinstance(self.path_or_stream, BytesIO): + self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8")) + elif isinstance(self.path_or_stream, Path): + self.content = StringIO(self.path_or_stream.read_text("utf-8")) + self.valid = True + except Exception as e: + raise RuntimeError( + f"CsvDocumentBackend could not load document with hash {self.document_hash}" + ) from e + return + + def is_valid(self) -> bool: + return self.valid + + @classmethod + def supports_pagination(cls) -> bool: + return False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.CSV} + + def convert(self) -> DoclingDocument: + """ + Parses the CSV data into a structured document model. + """ + + # Detect CSV dialect + head = self.content.readline() + dialect = csv.Sniffer().sniff(head, ",;\t|:") + _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"') + if not dialect.delimiter in {",", ";", "\t", "|", ":"}: + raise RuntimeError( + f"Cannot convert csv with unknown delimiter {dialect.delimiter}." + ) + + # Parce CSV + self.content.seek(0) + result = csv.reader(self.content, dialect=dialect, strict=True) + self.csv_data = list(result) + _log.info(f"Detected {len(self.csv_data)} lines") + + # Ensure uniform column length + expected_length = len(self.csv_data[0]) + is_uniform = all(len(row) == expected_length for row in self.csv_data) + if not is_uniform: + warnings.warn( + f"Inconsistent column lengths detected in CSV data. " + f"Expected {expected_length} columns, but found rows with varying lengths. " + f"Ensure all rows have the same number of columns." + ) + + # Parse the CSV into a structured document model + origin = DocumentOrigin( + filename=self.file.name or "file.csv", + mimetype="text/csv", + binary_hash=self.document_hash, + ) + + doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin) + + if self.is_valid(): + # Convert CSV data to table + if self.csv_data: + num_rows = len(self.csv_data) + num_cols = max(len(row) for row in self.csv_data) + + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], + ) + + # Convert each cell to TableCell + for row_idx, row in enumerate(self.csv_data): + for col_idx, cell_value in enumerate(row): + cell = TableCell( + text=str(cell_value), + row_span=1, # CSV doesn't support merged cells + col_span=1, + start_row_offset_idx=row_idx, + end_row_offset_idx=row_idx + 1, + start_col_offset_idx=col_idx, + end_col_offset_idx=col_idx + 1, + col_header=row_idx == 0, # First row as header + row_header=False, + ) + table_data.table_cells.append(cell) + + doc.add_table(data=table_data) + else: + raise RuntimeError( + f"Cannot convert doc with {self.document_hash} because the backend failed to init." + ) + + return doc diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 27a368f9..9178883f 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -12,6 +12,7 @@ from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell, Size +from docling.utils.locks import pypdfium2_lock if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend): padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t - image = ( - self._ppage.render( - scale=scale * 1.5, - rotation=0, # no additional rotation - crop=padbox.as_tuple(), - ) - .to_pil() - .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) - ) # We resize the image from 1.5x the given scale to make it sharper. + with pypdfium2_lock: + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize( + size=(round(cropbox.width * scale), round(cropbox.height * scale)) + ) + ) # We resize the image from 1.5x the given scale to make it sharper. return image def get_size(self) -> Size: - return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) + with pypdfium2_lock: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) def unload(self): self._ppage = None @@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - self._pdoc = pdfium.PdfDocument(self.path_or_stream) - self.parser = pdf_parser_v2("fatal") + with pypdfium2_lock: + self._pdoc = pdfium.PdfDocument(self.path_or_stream) + self.parser = pdf_parser_v2("fatal") - success = False - if isinstance(self.path_or_stream, BytesIO): - success = self.parser.load_document_from_bytesio( - self.document_hash, self.path_or_stream - ) - elif isinstance(self.path_or_stream, Path): - success = self.parser.load_document( - self.document_hash, str(self.path_or_stream) - ) + success = False + if isinstance(self.path_or_stream, BytesIO): + success = self.parser.load_document_from_bytesio( + self.document_hash, self.path_or_stream + ) + elif isinstance(self.path_or_stream, Path): + success = self.parser.load_document( + self.document_hash, str(self.path_or_stream) + ) - if not success: - raise RuntimeError( - f"docling-parse v2 could not load document {self.document_hash}." - ) + if not success: + raise RuntimeError( + f"docling-parse v2 could not load document {self.document_hash}." + ) def page_count(self) -> int: # return len(self._pdoc) # To be replaced with docling-parse API @@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): return len_2 def load_page(self, page_no: int) -> DoclingParseV2PageBackend: - return DoclingParseV2PageBackend( - self.parser, self.document_hash, page_no, self._pdoc[page_no] - ) + with pypdfium2_lock: + return DoclingParseV2PageBackend( + self.parser, self.document_hash, page_no, self._pdoc[page_no] + ) def is_valid(self) -> bool: return self.page_count() > 0 @@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): def unload(self): super().unload() self.parser.unload_document(self.document_hash) - self._pdoc.close() - self._pdoc = None + with pypdfium2_lock: + self._pdoc.close() + self._pdoc = None diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 286dfbfa..d14b422f 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,17 +1,22 @@ import logging from io import BytesIO from pathlib import Path -from typing import Optional, Set, Union +from typing import Final, Optional, Union, cast -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag +from bs4.element import PreformattedString from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -19,21 +24,38 @@ from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +# tags that generate NodeItem elements +TAGS_FOR_NODE_ITEMS: Final = [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "pre", + "ul", + "ol", + "li", + "table", + "figure", + "img", +] + class HTMLDocumentBackend(DeclarativeDocumentBackend): + @override def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.debug("About to init HTML backend...") self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} # type: ignore + self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -45,16 +67,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: raise RuntimeError( - f"Could not initialize HTML backend for file with hash {self.document_hash}." + "Could not initialize HTML backend for file with " + f"hash {self.document_hash}." ) from e + @override def is_valid(self) -> bool: return self.soup is not None @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -62,9 +88,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.HTML} + @override def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff origin = DocumentOrigin( @@ -80,107 +108,116 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): assert self.soup is not None content = self.soup.body or self.soup # Replace
tags with newline characters - for br in content.find_all("br"): - br.replace_with("\n") - doc = self.walk(content, doc) + # TODO: remove style to avoid losing text from tags like i, b, span, ... + for br in content("br"): + br.replace_with(NavigableString("\n")) + + headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"]) + self.content_layer = ( + ContentLayer.BODY if headers is None else ContentLayer.FURNITURE + ) + self.walk(content, doc) else: raise RuntimeError( - f"Cannot convert doc with {self.document_hash} because the backend failed to init." + f"Cannot convert doc with {self.document_hash} because the backend " + "failed to init." ) return doc - def walk(self, element: Tag, doc: DoclingDocument): - try: - # Iterate over elements in the body of the document - for idx, element in enumerate(element.children): + def walk(self, tag: Tag, doc: DoclingDocument) -> None: + + # Iterate over elements in the body of the document + text: str = "" + for element in tag.children: + if isinstance(element, Tag): try: - self.analyse_element(element, idx, doc) + self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: - - _log.error(" -> error treating child: ", exc_child) - _log.error(" => element: ", element, "\n") + _log.error( + f"Error processing child from tag{tag.name}: {exc_child}" + ) raise exc_child + elif isinstance(element, NavigableString) and not isinstance( + element, PreformattedString + ): + # Floating text outside paragraphs or analyzed tags + text += element + siblings: list[Tag] = [ + item for item in element.next_siblings if isinstance(item, Tag) + ] + if element.next_sibling is None or any( + [item.name in TAGS_FOR_NODE_ITEMS for item in siblings] + ): + text = text.strip() + if text and tag.name in ["div"]: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=text, + content_layer=self.content_layer, + ) + text = "" - except Exception as exc: - pass + return - return doc - - def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): - """ - if element.name!=None: - _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") - """ - - if element.name in self.labels: - self.labels[element.name] += 1 + def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: + if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: + self.handle_header(tag, doc) + elif tag.name in ["p"]: + self.handle_paragraph(tag, doc) + elif tag.name in ["pre"]: + self.handle_code(tag, doc) + elif tag.name in ["ul", "ol"]: + self.handle_list(tag, doc) + elif tag.name in ["li"]: + self.handle_list_item(tag, doc) + elif tag.name == "table": + self.handle_table(tag, doc) + elif tag.name == "figure": + self.handle_figure(tag, doc) + elif tag.name == "img": + self.handle_image(tag, doc) else: - self.labels[element.name] = 1 + self.walk(tag, doc) - if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - self.handle_header(element, idx, doc) - elif element.name in ["p"]: - self.handle_paragraph(element, idx, doc) - elif element.name in ["pre"]: - self.handle_code(element, idx, doc) - elif element.name in ["ul", "ol"]: - self.handle_list(element, idx, doc) - elif element.name in ["li"]: - self.handle_listitem(element, idx, doc) - elif element.name == "table": - self.handle_table(element, idx, doc) - elif element.name == "figure": - self.handle_figure(element, idx, doc) - elif element.name == "img": - self.handle_image(element, idx, doc) - else: - self.walk(element, doc) + def get_text(self, item: PageElement) -> str: + """Get the text content of a tag.""" + parts: list[str] = self.extract_text_recursively(item) - def get_direct_text(self, item: Tag): - """Get the direct text of the
  • element (ignoring nested lists).""" - text = item.find(string=True, recursive=False) - if isinstance(text, str): - return text.strip() - - return "" + return "".join(parts) + " " # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item: Tag): - result = [] + def extract_text_recursively(self, item: PageElement) -> list[str]: + result: list[str] = [] - if isinstance(item, str): + if isinstance(item, NavigableString): return [item] - if item.name not in ["ul", "ol"]: - try: - # Iterate over the children (and their text and tails) - for child in item: - try: - # Recursively get the child's text content - result.extend(self.extract_text_recursively(child)) - except: - pass - except: - _log.warn("item has no children") - pass + tag = cast(Tag, item) + if tag.name not in ["ul", "ol"]: + for child in tag: + # Recursively get the child's text content + result.extend(self.extract_text_recursively(child)) - return "".join(result) + " " + return ["".join(result) + " "] - def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_header(self, element: Tag, doc: DoclingDocument) -> None: """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) - slevel = hlevel - 1 - - label = DocItemLabel.SECTION_HEADER text = element.text.strip() if hlevel == 1: - for key, val in self.parents.items(): + self.content_layer = ContentLayer.BODY + + for key in self.parents.keys(): self.parents[key] = None self.level = 1 self.parents[self.level] = doc.add_text( - parent=self.parents[0], label=DocItemLabel.TITLE, text=text + parent=self.parents[0], + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, ) else: if hlevel > self.level: @@ -191,13 +228,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", label=GroupLabel.SECTION, parent=self.parents[i - 1], + content_layer=self.content_layer, ) self.level = hlevel elif hlevel < self.level: # remove the tail - for key, val in self.parents.items(): + for key in self.parents.keys(): if key > hlevel: self.parents[key] = None self.level = hlevel @@ -206,42 +244,58 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[hlevel - 1], text=text, level=hlevel, + content_layer=self.content_layer, ) - def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_code(self, element: Tag, doc: DoclingDocument) -> None: """Handles monospace code snippets (pre).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.CODE - if len(text) == 0: - return - doc.add_code(parent=self.parents[self.level], text=text) + if text: + doc.add_code( + parent=self.parents[self.level], + text=text, + content_layer=self.content_layer, + ) - def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: """Handles paragraph tags (p).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.PARAGRAPH - if len(text) == 0: - return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + if text: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=text, + content_layer=self.content_layer, + ) - def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": # create a list group self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], name="list", label=GroupLabel.LIST + parent=self.parents[self.level], + name="list", + label=GroupLabel.LIST, + content_layer=self.content_layer, ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, + content_layer=self.content_layer, ) self.level += 1 @@ -250,25 +304,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument): - """Handles listitem tags (li).""" - nested_lists = element.find(["ul", "ol"]) + def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: + """Handles list item tags (li).""" + nested_list = element.find(["ul", "ol"]) - parent_list_label = self.parents[self.level].label - index_in_list = len(self.parents[self.level].children) + 1 + parent = self.parents[self.level] + if parent is None: + _log.debug(f"list-item has no parent in DoclingDocument: {element}") + return + parent_label: str = parent.label + index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 - if nested_lists: - name = element.name + if nested_list: # Text in list item can be hidden within hierarchy, hence # we need to extract it recursively - text = self.extract_text_recursively(element) + text: str = self.get_text(element) # Flatten text, remove break lines: text = text.replace("\n", "").replace("\r", "") text = " ".join(text.split()).strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = str(index_in_list) enumerated = True @@ -278,7 +343,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, + content_layer=self.content_layer, ) self.level += 1 @@ -287,74 +353,95 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - elif isinstance(element.text, str): + elif element.text.strip(): text = element.text.strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True doc.add_list_item( text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, + content_layer=self.content_layer, ) else: - _log.warn("list-item has no text: ", element) - - def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): - """Handles table tags.""" + _log.debug(f"list-item has no text: {element}") + @staticmethod + def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warn("detected nested tables: skipping for now") - return + _log.debug("Skipping nested table.") + return None # Count the number of rows (number of elements) - num_rows = len(element.find_all("tr")) + num_rows = len(element("tr")) # Find the number of columns (taking into account colspan) num_cols = 0 - for row in element.find_all("tr"): + for row in element("tr"): col_count = 0 - for cell in row.find_all(["td", "th"]): - colspan = int(cell.get("colspan", 1)) + if not isinstance(row, Tag): + continue + for cell in row(["td", "th"]): + if not isinstance(row, Tag): + continue + val = cast(Tag, cell).get("colspan", "1") + colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 col_count += colspan num_cols = max(num_cols, col_count) - grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] + grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) # Iterate over the rows in the table - for row_idx, row in enumerate(element.find_all("tr")): + for row_idx, row in enumerate(element("tr")): + if not isinstance(row, Tag): + continue # For each row, find all the column cells (both and ) - cells = row.find_all(["td", "th"]) + cells = row(["td", "th"]) # Check if each cell in the row is a header -> means it is a column header col_header = True - for j, html_cell in enumerate(cells): - if html_cell.name == "td": + for html_cell in cells: + if isinstance(html_cell, Tag) and html_cell.name == "td": col_header = False + # Extract the text content of each cell col_idx = 0 - # Extract and print the text content of each cell - for _, html_cell in enumerate(cells): + for html_cell in cells: + if not isinstance(html_cell, Tag): + continue + # extract inline formulas + for formula in html_cell("inline-formula"): + math_parts = formula.text.split("$$") + if len(math_parts) == 3: + math_formula = f"$${math_parts[1]}$$" + formula.replace_with(NavigableString(math_formula)) + + # TODO: extract content correctly from table-cells with lists text = html_cell.text - try: - text = self.extract_table_cell_text(html_cell) - except Exception as exc: - _log.warn("exception: ", exc) - exit(-1) # label = html_cell.name - - col_span = int(html_cell.get("colspan", 1)) - row_span = int(html_cell.get("rowspan", 1)) + col_val = html_cell.get("colspan", "1") + col_span = ( + int(col_val) + if isinstance(col_val, str) and col_val.isnumeric() + else 1 + ) + row_val = html_cell.get("rowspan", "1") + row_span = ( + int(row_val) + if isinstance(row_val, str) and row_val.isnumeric() + else 1 + ) while grid[row_idx][col_idx] is not None: col_idx += 1 @@ -362,7 +449,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): for c in range(col_span): grid[row_idx + r][col_idx + c] = text - cell = TableCell( + table_cell = TableCell( text=text, row_span=row_span, col_span=col_span, @@ -373,70 +460,87 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): col_header=col_header, row_header=((not col_header) and html_cell.name == "th"), ) - data.table_cells.append(cell) + data.table_cells.append(table_cell) - doc.add_table(data=data, parent=self.parents[self.level]) + return data - def get_list_text(self, list_element: Tag, level=0): + def handle_table(self, element: Tag, doc: DoclingDocument) -> None: + """Handles table tags.""" + + table_data = HTMLDocumentBackend.parse_table_data(element) + + if table_data is not None: + doc.add_table( + data=table_data, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + + def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: """Recursively extract text from