mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
chore: typo fix (#1465)
* typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> * chore: typo fix Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com> --------- Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
This commit is contained in:
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return _txt
|
||||
|
||||
# restore original HTML by removing previouly added markers
|
||||
# restore original HTML by removing previously added markers
|
||||
for regex in [
|
||||
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
||||
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
||||
|
||||
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
# "List Bullet", "List Number", "List Paragraph"
|
||||
# Identify wether list is a numbered list or not
|
||||
# Identify whether list is a numbered list or not
|
||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||
is_numbered = False
|
||||
p_style_id, p_level = self._get_label_and_level(paragraph)
|
||||
|
||||
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.path_or_stream = path_or_stream
|
||||
|
||||
# Initialize the root of the document hiearchy
|
||||
# Initialize the root of the document hierarchy
|
||||
self.root: Optional[NodeItem] = None
|
||||
|
||||
self.valid = False
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Backend to parse patents from the United States Patent Office (USPTO).
|
||||
|
||||
The parsers included in this module can handle patent grants pubished since 1976 and
|
||||
The parsers included in this module can handle patent grants published since 1976 and
|
||||
patent applications since 2001.
|
||||
The original files can be found in https://bulkdata.uspto.gov.
|
||||
"""
|
||||
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
)
|
||||
|
||||
elif name == self.Element.PARAGRAPH.value and text:
|
||||
# remmove blank spaces added in paragraphs
|
||||
# remove blank spaces added in paragraphs
|
||||
text = re.sub("\\s+", " ", text)
|
||||
if self.Element.ABSTRACT.value in self.property:
|
||||
self.abstract = (
|
||||
@@ -1697,7 +1697,7 @@ class XmlTable:
|
||||
class HtmlEntity:
|
||||
"""Provide utility functions to get the HTML entities of styled characters.
|
||||
|
||||
This class has been developped from:
|
||||
This class has been developed from:
|
||||
https://unicode-table.com/en/html-entities/
|
||||
https://www.w3.org/TR/WD-math-970515/table03.html
|
||||
"""
|
||||
@@ -1896,7 +1896,7 @@ class HtmlEntity:
|
||||
"""Get an HTML entity of a greek letter in ISO 8879.
|
||||
|
||||
Args:
|
||||
The text to transform, as an ISO 8879 entitiy.
|
||||
The text to transform, as an ISO 8879 entity.
|
||||
|
||||
Returns:
|
||||
The HTML entity representing a greek letter. If the input text is not
|
||||
|
||||
@@ -521,7 +521,7 @@ def convert( # noqa: C901
|
||||
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
||||
pipeline_options.generate_page_images = True
|
||||
pipeline_options.generate_picture_images = (
|
||||
True # FIXME: to be deprecated in verson 3
|
||||
True # FIXME: to be deprecated in version 3
|
||||
)
|
||||
pipeline_options.images_scale = 2
|
||||
|
||||
|
||||
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
|
||||
tcells = table_cluster.cells
|
||||
tokens = []
|
||||
for c in tcells:
|
||||
# Only allow non empty stings (spaces) into the cells of a table
|
||||
# Only allow non empty strings (spaces) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
new_cell.rect = BoundingRectangle.from_bounding_box(
|
||||
|
||||
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
script = map_tesseract_script(script)
|
||||
lang = f"{self.script_prefix}{script}"
|
||||
|
||||
# Check if the detected languge is present in the system
|
||||
# Check if the detected language is present in the system
|
||||
if lang not in self._tesserocr_languages:
|
||||
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
|
||||
msg += " However this language is not installed in your system and will be ignored."
|
||||
|
||||
Reference in New Issue
Block a user