feat: add backend for METS with Google Books profile (#1989)

* add backend for METS with Google Books profile

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Fixes for cell indexing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* use HTMLParser and add options from CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix typing and unloading

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restore guess format

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename inputformat

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use PdfDocumentBackend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use test file from test folder (file itself still missing; added in a later commit)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add test file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-08-18 11:43:20 +02:00
committed by GitHub
parent 9687297262
commit 31087f3fcc
9 changed files with 529 additions and 7 deletions

View File

@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
METS_GBS = "mets_gbs"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.METS_GBS: ["tar.gz"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
}
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.METS_GBS: ["application/mets+xml"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
}