chore: typo fix (#1465)

* typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
* chore: typo fix
  Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

---------

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
2025-12-10 13:48:13 +00:00 · 2025-04-28 14:52:09 +08:00
parent 3afbe6c969
commit a097ccd8d5
14 changed files with 19 additions and 19 deletions
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                        )
                    return _txt

-                # restore original HTML by removing previouly added markers
+                # restore original HTML by removing previously added markers
                for regex in [
                    rf"<pre>\s*<code>\s*{_START_MARKER}",
                    rf"{_STOP_MARKER}\s*</code>\s*</pre>",
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

        # Common styles for bullet and numbered lists.
        # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
        # is_numbered = "List Bullet" not in paragraph.style.name
        is_numbered = False
        p_style_id, p_level = self._get_label_and_level(paragraph)
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
        super().__init__(in_doc, path_or_stream)
        self.path_or_stream = path_or_stream

-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
        self.root: Optional[NodeItem] = None

        self.valid = False
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).

-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                    )

            elif name == self.Element.PARAGRAPH.value and text:
-                # remmove blank spaces added in paragraphs
+                # remove blank spaces added in paragraphs
                text = re.sub("\\s+", " ", text)
                if self.Element.ABSTRACT.value in self.property:
                    self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
    """Provide utility functions to get the HTML entities of styled characters.

-    This class has been developped from:
+    This class has been developed from:
    https://unicode-table.com/en/html-entities/
    https://www.w3.org/TR/WD-math-970515/table03.html
    """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
        """Get an HTML entity of a greek letter in ISO 8879.

        Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.

        Returns:
            The HTML entity representing a greek letter. If the input text is not
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -521,7 +521,7 @@ def convert(  # noqa: C901
            if image_export_mode != ImageRefMode.PLACEHOLDER:
                pipeline_options.generate_page_images = True
                pipeline_options.generate_picture_images = (
-                    True  # FIXME: to be deprecated in verson 3
+                    True  # FIXME: to be deprecated in version 3
                )
                pipeline_options.images_scale = 2

--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
                                tcells = table_cluster.cells
                            tokens = []
                            for c in tcells:
-                                # Only allow non empty stings (spaces) into the cells of a table
+                                # Only allow non empty strings (spaces) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
                                    new_cell.rect = BoundingRectangle.from_bounding_box(
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
                            script = map_tesseract_script(script)
                            lang = f"{self.script_prefix}{script}"

-                            # Check if the detected languge is present in the system
+                            # Check if the detected language is present in the system
                            if lang not in self._tesserocr_languages:
                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                                msg += " However this language is not installed in your system and will be ignored."