diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml deleted file mode 100644 index 0bdd730c..00000000 --- a/.github/actions/setup-poetry/action.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: 'Set up Poetry and install' -description: 'Set up a specific version of Poetry and install dependencies using caching.' -inputs: - python-version: - description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax." - default: '3.11' -runs: - using: 'composite' - steps: - - name: Install poetry - run: pipx install poetry==1.8.3 - shell: bash - - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python-version }} - cache: 'poetry' - - name: Install dependencies - run: poetry install --all-extras - shell: bash diff --git a/.github/mergify.yml b/.github/mergify.yml deleted file mode 100644 index 8d30d733..00000000 --- a/.github/mergify.yml +++ /dev/null @@ -1,18 +0,0 @@ -merge_protections: - - name: Enforce conventional commit - description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/ - if: - - base = main - success_conditions: - - "title ~= - ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\ - \\))?(!)?:" - - name: Require two reviewer for test updates - description: When test data is updated, we require two reviewers - if: - - base = main - - or: - - files ~= ^tests/data - - files ~= ^tests/data_scanned - success_conditions: - - "#approved-reviews-by >= 2" diff --git a/.github/scripts/release.sh b/.github/scripts/release.sh deleted file mode 100755 index 6cac4006..00000000 --- a/.github/scripts/release.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -set -e # trigger failure on error - do not remove! -set -x # display command on output - -if [ -z "${TARGET_VERSION}" ]; then - >&2 echo "No TARGET_VERSION specified" - exit 1 -fi -CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}" - -# update package version -poetry version "${TARGET_VERSION}" - -# collect release notes -REL_NOTES=$(mktemp) -poetry run semantic-release changelog --unreleased >> "${REL_NOTES}" - -# update changelog -TMP_CHGLOG=$(mktemp) -TARGET_TAG_NAME="v${TARGET_VERSION}" -RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}" -printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}" -cat "${REL_NOTES}" >> "${TMP_CHGLOG}" -if [ -f "${CHGLOG_FILE}" ]; then - printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}" -fi -mv "${TMP_CHGLOG}" "${CHGLOG_FILE}" - -# push changes -git config --global user.name 'github-actions[bot]' -git config --global user.email 'github-actions[bot]@users.noreply.github.com' -git add pyproject.toml "${CHGLOG_FILE}" -COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]" -git commit -m "${COMMIT_MSG}" -git push origin main - -# create GitHub release (incl. Git tag) -gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}" diff --git a/.github/workflows/cd-docs.yml b/.github/workflows/cd-docs.yml deleted file mode 100644 index 1ff7c4fc..00000000 --- a/.github/workflows/cd-docs.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: "Run Docs CD" - -on: - push: - branches: - - "main" - -jobs: - build-deploy-docs: - uses: ./.github/workflows/docs.yml - with: - deploy: true - permissions: - contents: write diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml deleted file mode 100644 index 1f0502dc..00000000 --- a/.github/workflows/cd.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: "Run CD" - -on: - workflow_dispatch: - -env: - # disable keyring (https://github.com/actions/runner-images/issues/6185): - PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring - -jobs: - code-checks: - uses: ./.github/workflows/checks.yml - pre-release-check: - runs-on: ubuntu-latest - outputs: - TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 # for fetching tags, required for semantic-release - - uses: ./.github/actions/setup-poetry - - name: Check version of potential release - id: version_check - run: | - TRGT_VERSION=$(poetry run semantic-release print-version) - echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT - echo "${TRGT_VERSION}" - - name: Check notes of potential release - run: poetry run semantic-release changelog --unreleased - release: - needs: [code-checks, pre-release-check] - if: needs.pre-release-check.outputs.TARGET_TAG_V != '' - environment: auto-release - runs-on: ubuntu-latest - concurrency: release - steps: - - uses: actions/create-github-app-token@v1 - id: app-token - with: - app-id: ${{ vars.CI_APP_ID }} - private-key: ${{ secrets.CI_PRIVATE_KEY }} - - uses: actions/checkout@v4 - with: - token: ${{ steps.app-token.outputs.token }} - fetch-depth: 0 # for fetching tags, required for semantic-release - - uses: ./.github/actions/setup-poetry - - name: Run release script - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }} - CHGLOG_FILE: CHANGELOG.md - run: ./.github/scripts/release.sh - shell: bash diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml deleted file mode 100644 index 1cd08f2c..00000000 --- a/.github/workflows/checks.yml +++ /dev/null @@ -1,40 +0,0 @@ -on: - workflow_call: - -jobs: - run-checks: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] - steps: - - uses: actions/checkout@v4 - - name: Install tesseract - run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config - - name: Set TESSDATA_PREFIX - run: | - echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" - - uses: ./.github/actions/setup-poetry - with: - python-version: ${{ matrix.python-version }} - - name: Run styling check - run: poetry run pre-commit run --all-files - - name: Install with poetry - run: poetry install --all-extras - - name: Testing - run: | - poetry run pytest -v tests - - name: Run examples - run: | - for file in docs/examples/*.py; do - # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then - echo "Skipping $file" - continue - fi - - echo "Running example $file" - poetry run python "$file" || exit 1 - done - - name: Build with poetry - run: poetry build diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml deleted file mode 100644 index 6e9134d5..00000000 --- a/.github/workflows/ci-docs.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: "Run Docs CI" - -on: - pull_request: - types: [opened, reopened, synchronize] - push: - branches: - - "**" - - "!gh-pages" - -jobs: - build-docs: - if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} - uses: ./.github/workflows/docs.yml - with: - deploy: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e2b21ed2..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: "Run CI" - -on: - pull_request: - types: [opened, reopened, synchronize] - push: - branches: - - "**" - - "!main" - - "!gh-pages" - -env: - # disable keyring (https://github.com/actions/runner-images/issues/6185): - PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring - -jobs: - code-checks: - if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} - uses: ./.github/workflows/checks.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 2733b522..00000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,20 +0,0 @@ -on: - workflow_call: - inputs: - deploy: - type: boolean - description: "If true, the docs will be deployed." - default: false - -jobs: - run-docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/setup-poetry - - name: Build docs - run: poetry run mkdocs build --verbose --clean - - name: Build and push docs - if: inputs.deploy - run: poetry run mkdocs gh-deploy --force - \ No newline at end of file diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml deleted file mode 100644 index 395f34cd..00000000 --- a/.github/workflows/pypi.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: "Build and publish package" - -on: - release: - types: [published] - -permissions: - contents: read - -env: - # disable keyring (https://github.com/actions/runner-images/issues/6185): - PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring - -jobs: - build-and-publish: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/setup-poetry - - name: Build and publish - run: poetry publish --build --no-interaction --username=__token__ --password=${{ secrets.PYPI_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index dee6e30c..597dde6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,61 @@ +## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 + +### Feature + +* **CLI:** Expose code and formula models in the CLI ([#820](https://github.com/DS4SD/docling/issues/820)) ([`6882e6c`](https://github.com/DS4SD/docling/commit/6882e6c38df30e4d4a1b83e01b13900ca7ea001f)) +* Add platform info to CLI version printout ([#816](https://github.com/DS4SD/docling/issues/816)) ([`95b293a`](https://github.com/DS4SD/docling/commit/95b293a72356f94c7076e3649be970c8a51121a3)) +* **ocr:** Expose `rec_keys_path` in RapidOcrOptions to support custom dictionaries ([#786](https://github.com/DS4SD/docling/issues/786)) ([`5332755`](https://github.com/DS4SD/docling/commit/53327552e83ced079ae50d8067ba7a8ce80cd9ad)) +* Introduce automatic language detection in TesseractOcrCliModel ([#800](https://github.com/DS4SD/docling/issues/800)) ([`3be2fb5`](https://github.com/DS4SD/docling/commit/3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f)) + +### Fix + +* Fix single newline handling in MD backend ([#824](https://github.com/DS4SD/docling/issues/824)) ([`5aed9f8`](https://github.com/DS4SD/docling/commit/5aed9f8aeba1624ba1a721e2ed3ba4aceaa7a482)) +* Use file extension if filetype fails with PDF ([#827](https://github.com/DS4SD/docling/issues/827)) ([`adf6353`](https://github.com/DS4SD/docling/commit/adf635348365f82daa64e3f879076a7baf71edc0)) +* Parse html with omitted body tag ([#818](https://github.com/DS4SD/docling/issues/818)) ([`a112d7a`](https://github.com/DS4SD/docling/commit/a112d7a03512e8a00842a100416426254d6ecfc0)) + +### Documentation + +* Document Docling JSON parsing ([#819](https://github.com/DS4SD/docling/issues/819)) ([`6875913`](https://github.com/DS4SD/docling/commit/6875913e34abacb8d71b5d31543adbf7b5bd5e92)) +* Add SSL verification error mitigation ([#821](https://github.com/DS4SD/docling/issues/821)) ([`5139b48`](https://github.com/DS4SD/docling/commit/5139b48e4e62bb061d956c132958ec2e6d88e40a)) +* **backend XML:** Do not delete temp file in notebook ([#817](https://github.com/DS4SD/docling/issues/817)) ([`4d41db3`](https://github.com/DS4SD/docling/commit/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b)) +* Typo ([#814](https://github.com/DS4SD/docling/issues/814)) ([`8a4ec77`](https://github.com/DS4SD/docling/commit/8a4ec77576b8a9fd60d0047939665d00cf93b4dd)) +* Added markdown headings to enable TOC in github pages ([#808](https://github.com/DS4SD/docling/issues/808)) ([`b885b2f`](https://github.com/DS4SD/docling/commit/b885b2fa3c2519c399ed4b9a3dd4c2f6f62235d1)) +* Description of supported formats and backends ([#788](https://github.com/DS4SD/docling/issues/788)) ([`c2ae1cc`](https://github.com/DS4SD/docling/commit/c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94)) + +## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24 + +### Feature + +* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec)) +* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc)) +* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b)) +* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02)) + +### Fix + +* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed)) +* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2)) + +### Documentation + +* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f)) +* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5)) +* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756)) +* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5)) +* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3)) + +## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10 + +### Fix + +* Improve OCR results, stricten criteria before dropping bitmap areas ([#719](https://github.com/DS4SD/docling/issues/719)) ([`5a060f2`](https://github.com/DS4SD/docling/commit/5a060f237d1decd0ff9db9e73478978419315778)) +* Allow earlier requests versions ([#716](https://github.com/DS4SD/docling/issues/716)) ([`e64b5a2`](https://github.com/DS4SD/docling/commit/e64b5a2f628acc340a6d94ee6f1ada2aa267cecc)) + +### Documentation + +* Add pointers to LangChain-side docs ([#718](https://github.com/DS4SD/docling/issues/718)) ([`9a6b5c8`](https://github.com/DS4SD/docling/commit/9a6b5c8c8debc81e0ddcbe91df6afbbeb29e97e6)) +* Add LangChain docs ([#717](https://github.com/DS4SD/docling/issues/717)) ([`4fa8028`](https://github.com/DS4SD/docling/commit/4fa8028bd8120d7557e1d45ba31e200e130af698)) + ## [v2.15.0](https://github.com/DS4SD/docling/releases/tag/v2.15.0) - 2025-01-08 ### Feature diff --git a/README.md b/README.md index 78acb592..5a957d60 100644 --- a/README.md +++ b/README.md @@ -22,23 +22,25 @@ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) -Docling parses documents and exports them to the desired format with ease and speed. +Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. ## Features -* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images) -* 📑 Advanced PDF document understanding including page layout, reading order & table structures -* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format -* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI -* 🔍 OCR support for scanned PDFs +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more +* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* 🔒 Local execution capabilities for sensitive data and air-gapped environments +* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI +* 🔍 Extensive OCR support for scanned PDFs and images * 💻 Simple and convenient CLI -Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling! - ### Coming soon -* ♾️ Equation & code extraction * 📝 Metadata extraction, including title, authors, references & language +* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) +* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) +* 📝 Complex chemistry understanding (Molecular structures) ## Installation @@ -120,3 +122,7 @@ For individual model usage, please refer to the model licenses found in the orig ## IBM ❤️ Open Source AI Docling has been brought to you by IBM. + +[supported_formats]: https://ds4sd.github.io/docling/supported_formats/ +[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ +[integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index b47b11cd..491330b3 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC): def supports_pagination(cls) -> bool: pass - @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 829419af..397bfc44 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -24,7 +24,6 @@ _log = logging.getLogger(__name__) class AsciiDocBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index bb1fe058..6d22127b 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend): return cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 32 * 32 + AREA_THRESHOLD = 0 # 32 * 32 for i in range(len(self._dpage["images"])): bitmap = self._dpage["images"][i] @@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 12d7df55..27a368f9 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): return cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 32 * 32 + AREA_THRESHOLD = 0 # 32 * 32 images = self._dpage["sanitized"]["images"]["data"] images_header = self._dpage["sanitized"]["images"]["header"] @@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index ae478885..286dfbfa 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,9 +1,9 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy @@ -78,17 +78,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): assert self.soup is not None + content = self.soup.body or self.soup # Replace
tags with newline characters - for br in self.soup.body.find_all("br"): + for br in content.find_all("br"): br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + doc = self.walk(content, doc) else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) return doc - def walk(self, element, doc): + def walk(self, element: Tag, doc: DoclingDocument): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): @@ -105,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc - def analyse_element(self, element, idx, doc): + def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): """ if element.name!=None: _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") @@ -135,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: self.walk(element, doc) - def get_direct_text(self, item): + def get_direct_text(self, item: Tag): """Get the direct text of the
  • element (ignoring nested lists).""" text = item.find(string=True, recursive=False) if isinstance(text, str): @@ -144,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "" # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): + def extract_text_recursively(self, item: Tag): result = [] if isinstance(item, str): @@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(result) + " " - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) slevel = hlevel - 1 @@ -207,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): """Handles monospace code snippets (pre).""" if element.text is None: return @@ -215,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): """Handles paragraph tags (p).""" if element.text is None: return @@ -227,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -249,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): + def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) @@ -303,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: _log.warn("list-item has no text: ", element) - def handle_table(self, element, idx, doc): + def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): """Handles table tags.""" nested_tables = element.find("table") @@ -376,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(self, list_element, level=0): + def get_list_text(self, list_element: Tag, level=0): """Recursively extract text from