From 52713f0cf50059423e29f0d357f3f01142e19fda Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 11 Oct 2024 12:57:47 +0200 Subject: [PATCH] Optionally produce legacy_doc Signed-off-by: Christoph Auer --- docling/datamodel/document.py | 2 +- docling/datamodel/pipeline_options.py | 5 ++++- docling/models/ds_glm_model.py | 9 +++++++-- .../pipeline/standard_pdf_model_pipeline.py | 6 ++++-- examples/run_with_formats.py | 8 ++++++-- tests/data/word_sample.docx | Bin 103051 -> 103139 bytes tests/test_e2e_ocr_conversion.py | 2 +- 7 files changed, 23 insertions(+), 9 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 721ccfcd..ede4e328 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel): pages: List[Page] = [] assembled: AssembledUnit = AssembledUnit() - legacy_output: DsDocument = _EMPTY_LEGACY_DOC + legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC output: DoclingDocument = _EMPTY_DOCLING_DOC def _to_legacy_document(self) -> DsDocument: diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 4be6fcec..45a9771b 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions): ) -class PipelineOptions(BaseModel): ... 
+class PipelineOptions(BaseModel): + create_legacy_output: bool = ( + True # This default will be set to False in a future version of docling + ) class PdfPipelineOptions(PipelineOptions): diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 06c0aea3..529b12ce 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult class GlmModel: def __init__(self, config): self.config = config + self.create_legacy_output = config.get("create_legacy_output", True) + self.model_names = self.config.get( "model_names", "" ) # "language;term;reference" @@ -42,7 +44,10 @@ class GlmModel: ) docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental - legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) + legacy_doc: DsLegacyDocument = None + + if self.create_legacy_output: + legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) # DEBUG code: def draw_clusters_and_cells(ds_document, page_no): @@ -92,4 +97,4 @@ class GlmModel: # draw_clusters_and_cells(ds_doc, 0) # draw_clusters_and_cells(exported_doc, 0) - return (legacy_doc, docling_doc) + return (docling_doc, legacy_doc) diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index dba1f3dc..c0dab3b4 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): artifacts_path = self.download_models_hf() self.artifacts_path = Path(artifacts_path) - self.glm_model = GlmModel(config={}) + self.glm_model = GlmModel( + config={"create_legacy_output": pipeline_options.create_legacy_output} + ) ocr_model: BaseOcrModel if isinstance(pipeline_options.ocr_options, EasyOcrOptions): @@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): elements=all_elements, headers=all_headers, body=all_body ) - 
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res) + conv_res.output, conv_res.legacy_output = self.glm_model(conv_res) return conv_res diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index 398121bd..f086bae2 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -1,3 +1,4 @@ +import json import logging from pathlib import Path @@ -54,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal conv_results = doc_converter.convert_batch(input) for res in conv_results: - out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" + out_path = Path("./scratch") print( f"Document {res.input.file.name} converted with status {res.status}." f"\nSaved markdown output to: {str(out_path)}" ) # print(res.experimental.export_to_markdown()) # Export Docling document format to markdown (experimental): - with out_path.open("w") as fp: + with (out_path / f"{res.input.file.name}.md").open("w") as fp: fp.write(res.output.export_to_markdown()) + + with (out_path / f"{res.input.file.name}.json").open("w") as fp: + fp.write(json.dumps(res.output.export_to_dict())) diff --git a/tests/data/word_sample.docx b/tests/data/word_sample.docx index 70ef7375ec93deb062e369988a88a9676e96c3b4..b1889405865c5bfcc594a04de63ae20168e4a84c 100644 GIT binary patch delta 9806 zcmZ8nWl&trww*zPySoP;+}+*X-Q9I?4?b9McX!ti++72~Ap{7JpfBIOukO3`)}KD- zbXRv*_3AC_?D#Uc!!o!=BzSPq7RP%SG629Y1OQ+F0018scS}}F7Yi>ZD`!s@Unj>= z-3{kMZtNh|d9bNgA(|A2K9j=*Z)C3$G)%Z>YX5y?ifw!hH6r#AtW@ZmG`PWJQNg~+ z(S|*aLo{5n8*{<~jrF>`_uVS9%vQozPEUAY+9xVXmb5AB_#*b<89Z`3xui<(N_v=n zWLbPASM}=tdc||yAvRsiK-DD*NXZ;(kD=A4l*BmoC!nDoIqFG}&V@MA>`E~$ZP-gu z6}u25c};;(XmE0+N}0s=kWYvV?NJO{iK1{ldLzd$=T}Q$JjA@=k?&T>Eq2!WJjgGg z$k&Uk@+CeQqa2;4f>#KVgD}t=`6UQVqJB*u=T0TpdQd;ImoD(7`VcH;3s{|?&LXCc z?8n_8Bl&)J5E^-@VyFjn4u%OYrpF1H5-KBw+r4=LsdQH;EK7c`e5lWF$|^1ClrqDr zJMf#Y9XMZAI2Y^d@A6`lV|xpWo*pZzobjJb+Y_{WhlBjQ@n4T$B{^;VG2_eMXsOB5A=hrNaLYN>oS 
z(?@iRv|L@RFH#AxyfNNI!fy{_!-c{$_X14A$hM*yFFiU@nK_4B7(t5C2}0 z!t@YytMEXf34*3@0QjRtzVoWmmBe$M5kY&keiwJgxCgs(}xoi zhwdxXVrhb@Qw-&)5W$oha&$Ci#-s{+gc>kA^!?{6%Zr}Tu{%GrWq5=u4Su>a;Wy5m z5-#ur!g$!|r?=C?W9V@y@e&&Nvc+;<-N)VS=F=)ifoUYd^ls*_8%#rgaw4pNQK2)d zq<@;deYVhamUXEU~c;AVAy=n03=zlrGBBFOUJ+Rm;E zmAC3Hazdmb><7RJQiR6H{ZLMH2vj4j{t(!XCH;&-(^+9bw(aJZV-Y#gXdU zP5Z8os4ST->jt~^3iCt1;S4U_3H3NxrA%I%cfHP+_F5GBO0)8ssH_d+yp@exIs?2@ z{^_Ymdx3@3mtGh>gMvOJG&H>V<1S#!-(H}sNCQii@}-JEXzp5M%}mEe9wO5N^WTF)$OOsSs115254ViH+s6JS(ASCEemXCxUVNX<-`Aq-IgqqP7 zTn%S`gU)gCU8}PjDY#)~<& zR#>GdY)b%&!}7w5eB(5M5{vK|s&nk`$7`^>rnrZPxzer< z*=$`T76Z+}QxDL9gZcF1%Tn-gHi3psFaEiFUtQmK6g1@K+~oJ_@5v|g8F64qCs}U? zks_Q(@-SzoB=hzetH-DX+*Ewe0>1>9J&6rW*+vZ){>^vZR+23Z#W!dG;QjqlZ_9a$ z8$TFlcnv78woGq}9K+8ER3}&H!RfK2-dv!JkCGglsDC012o=&Tu%xCDNX;FUKlZ#b zg>Ul1oOpkJ!=h(XQs`&c9RM4U#8*ABR9qk7CBJKB80RaJBO6%``4(@E{p#dV4`$VR z#;ipS>@B4cQc$5?e=G{85?rVY+d){dmkIt}$eZPP!VN|Di!?`^yTps7J%EK@$$to` zZ|;cBt$>5?IOCWu?XDKHbk>8G_ByAJtK?TEz$gBtQ-3icPceh0Mi=ZiMZ2`;hk~ge zl6v$Db)FKI?}TTcPcR1FB1T`(cH)AP1c=D)8bgu*SD3s`F3ct!w#d>aqFRo}sdQxO zKjy$Djkq<2Zn2m-97nPj<^5u!Ch#dvW;JY~2xG2VXxdFpm6Q0Rm6)9j-0R?iHy0qD2iSd!>uyo<>@ zrsQBd zGQ-M+=NKsN1$TQHRVQ_h&~$#H)*tJ(t{a+loUogCy9AcLl!0@D$)nVBeq38RI7kAgAmhCf>Q;=48P?`)%xh&8|J{XoHt z_?q5VyY-Yv($GxBvke=6hy&q$2y3$kjafK4L*VGEI&*gGn=~q~j^5AB8dH;`9IG-; zDVBnzLVNItW?S@Z$jN>2>8}Q&U&&zwnAv77_5)m87A>}-nUsfP{yQNGqY%H^j{4GT z**`_o>RP!|CS9sjsdkePDej z$9AD+IT$@RmAuMqAbZE@G_<1SJr(C{)XIOCVygvex-QAOF6kgT#)^LpNiZ~dj{P}f)V+Yfwi*R^N2HEwn# z_*64gGYL$6OApg+GjU*h(+2fIU4O0!MDBX6V5|QSrkrPq0vs+Eln_1y z@ISnZ2R-fa0tU`atWgtjN$8$Nfd{r1wu@jz;-goEdM>2yr2Bkx^;NV8?~ecrJ!*dtyhwvjV#d)#x=qXg-8=CkPJ@U3 z{cqL9T3?U3L@0sVc@WzmFg=>qz=)Y-I!NW1^k*|KRH?GF_F4j>Ua8gW?rsqk+AxR{ zuA!B>4^{j}=ryZwGRCANM)7x|wRyp@{6IGTfE zI#D{$XqHCm)27oFZ#~YAf;E_+c=~lJhc1V8UDV-a&_(M-0mVh7dMT(HE|uzts~5Q zo#r;7;2Jh1z*osxmgDZ+Tgkp&^$Sbbp0ARWK8~9X$3%giH*pn@-Q;9dc(v$DnXE2A zycS!mASl@Gi|hB>v~wIX@PzXg2tDc=_6arY2Wp!?hB>^K6J8gN+%o`K%0PS+mip-E 
zO2NJbv#Y9>S~LhmlX~Z79gJGF)k8H=D}>8#<>}nRljvj?r<=RDv;Fq!z>%|=p_%*z zWPv?*g_ZHJb8wNlCDCf7s8-(C${9X+I=HZ{oPK}-z@*6%^^;HsQ)gKkc!lEw9eb28 z;NWS^_xb8I1l@+vGt^mslPn6Qdq+x)2e;}DhI4Zd0t#UeBa>{R0=TGQI6zlny+Cr3 zq7!emx0OmZkg=EO5P9-LwFF{x3JVMmsq}4k=>#Zsk47mvg?}s7lZW>(>wJhA3x?s9 z!8=6sCigA{&Z82%Siz~VBFB;E_9>J;h;%=t-TuR9QUk3bz>ID7rL(-TD-VfK>!IY! zZYL?^L2F5Ozjm+Lzl&f{1|p}m6(`)qIHQAn1f!v4iSi-$Opu)+EuE(Vnm~RjTn=Sp z01_fJaqb8IQk9I7KI_CD=pwt_OlE&4&})*?gc*f7e7(x&d#;|3Tl16yYUB^@ar^I zA2^cX-v)ta1!%J*~}vD@v(*ma47^mYk6;zkA1Ern^Fz^aFDY+O%aRg0m8%R zt}EV)`D~b)a)-zc(43}2Y2iC|c?vNlNl`vh2cpY`!4R9bp1u1I*qJE_XYr^mb8g-3 zurIxq%fNJr;@o)@vtHEgY_i{E7*L%dXc<$kEicK)kd$S0qjVIuYV{nv%x{Md2zd&K z$~&%IdPFL3Jg}@^Zqc}2U3>N5KhLQzdGtGNA5Fg^a~ZI@-@vCW;9$d2C{&zYr=bv# zgSW`BL-y#dM0A5yZn%_dX*DuDHTKHApMltE>(yN!Tv^_O_Z_rt@($YY`_lGO3gM|9hXn4lV)Bx;*X=r^{!}|s=V~^O zxn{UQk$@f2gcw6Zoops0OmZGNd68S#`LjEbKd%ut()qRg7~Bc1dgxeeuNs~QMpv4Q zCw_+}qoUB=xZvpk~z}>`87rWDFA< z)Aa-8@6&&o-(LE}qB|s@|G>Q&q+^VBT;u^;qda>TMzL2uZGqSYuJ48!dlZ<237l+` z=3imTW7Z}B-Zh}**-f$tXB^V5mZxE7ROvr72UO@q>*jSH%l@1AXxUgIZc|={NN`Qr zJ!|2++ekWhc9xkXPq>=aoYi1y%>jb6=*TNxpCT;S&)&A-9gSaCHWcOFDHN&rBsWF- zaGh~hGs7&t+uu%iZX<0bMGxyDBaJyW_QKZrj@cqtmP@wnhan8*acoWRjls;1M}K3` zg`IuL`&syDxvY>vT1z_qmNqbuK=efXC2-B-q{jA@?8|jA^Lvcd_b263@V)TIjHN29 zx;39~@T1KaQ%5dFKyS48oy~RswK$hc+jp(Go(d0adXn`BsJ|f+-&7oGP%gB%U!um z%K_OM)Tj6!wu$#pN)|lh~qB@Tlmwli)&1^h2yyNdeMv z1c1k-_-7zi^pn~#ogtJ$ZvasqUrAH56y{YjuXd?9wab>NZ>c~!IH+IhCI`{6$e{Z; zrsJ`Pae!ULxgF=w)<~3BH43`2G&E89H4<7)#9uDI)$mA+nOs0;-m9NY8}&g>$rn`z zO7(B)WEallp4a$xNIv*DyCTJ@BDPHA|}6L``A6x z4y{4U@Eq~ol6(1PFrh9f#-+$ZF7B*POo<_8f3+IwkK@|ZTpVe%mVvBXiJC)KEQ5HG z;~J7`xmC>g8Z;&h`FTIg@bL)2q5co$nEEbCZn670zRHzBiB8zhmLplUS=8XWCDs@T zC!1xbHuko5Z9kJ;Mey=vts4SZ@=zws*ycY zcNztdYN+5`h$^zSYr+gBB*ZdYr5T?-O*6byVWdKp1j7bXgpt9DG93NkFid%tP|ARr zfUEwNmwOgwfZN(%lFJdSVb)1?eVpLT9T5(V$a8|qkmBCH=GRaAVMO*uIhaW74QO?$ z`X;p9ADu2Y0iL{&=nFS|-;~SI|1pIJM^)>Rr5_mj*Amh zm=m5&29xb2(lTuzxoTLEb}AsCR581FQ~Pz%i_&4)gY_qg0Mz)QfyL+O-9Y{}iVc`u 
zLI;?QQKT!J=AVb0ZB!J*HC!dGg+t;errYAKbCZ&v7S2)ZbgK2xCRfc9C%GzYT`SrbpVZX?apZ|AMT&^H=T!&`Jxhvf zEG2C)FV3i4q}u=TTA>wS{VmTW>>hpIeN}K#7XapCI`yVATDheTH2V!!1lM*LNBa@p z=_`T@4X=<}d!$(J5-ug@c-RLrVWn|@HP3$?HpV5;E438xW#db5Lz|zRQJ>(c{kWdL ze?bbSuum(LsVT^7C&}CJdN?k{ttS)+kYkk)RAlo(ec0aM{`l<`uw=H6Ip!GW zjk&&1?rp)umzF;&SRlaQt$)#`Q!f)nE74XFo^<5dcae>OwnG69HE#iVUyhcy%eXfj z#Fvzn1`9GBMlqE z?V|ia0v5IQfiVBDWM19FBN@28jc74DXXkC)A>L>ECUhlnu8ku(3(^@DN?5jCf{bVt z*6oZa#QiCY(${OLHIa8`Ie^6N;9E};Ml`~i%xEm#y0A2{SX|YMI|dyo;2F;?g8ulN zQ7YHB^W0)otq6>zb-5~ZuxW2r=A-k0%{m0v?K?&97-kxG3?ADPF@YMX?ZUhj${!?lHf2cygz?T3%;D)quRTI?~#vKZ>%n`giY&25A4=)<}W31#;T8H(TOn z@k3gyu4oxf+BozNV#SW{6@)+LkhAmTC>xMnwW{TJC_n4SwG0arqxifF{4kK45npX| zG4AvzW>rqjJqOwkq{)t42_^NYre?TqmLm;B_v2_jiZ+~!eHX4ZIF-}AZ2sNz z)tIG|3;JZ)SRxee2nhomIqI`oC40e=w`7O1g5^_AHQx> zwrRCK&E2=FR5PACn|?6jr_v%sjM4GnU)?>EXE=yhf<(WFqTUY;?XH>Y+}+qSJ_@Rg z2idbD0~O&PskD+iSilv8njw>DHO67?f{F-cXD5n~ERYF@9xRw4v!EY_3&2&mK2r#n zz8o_7c#E?`rx?vIH@<7H+2wSIpviZndoBX?wKQhgRDO>yl)h|TK`}48JCUvKEcgvU z8>c*G!~MD`7P!w>S6=4MHiV$sWnoUREETtM2$&gd>=y8oR7Qm5~Y&D2`zz*}!I>CsVe^kZ~Fv!X|o8Sh6am*|D#;DTo8Z6T56DytV>)irQ%}rbd84 zEHf!3OL6PvA<_+Up>h?&f;Y5(q;F_|ssc0&HUJKgwub|UPzKLX;f4tSv~(!D0JG4+ zTBu2e=BjC=Eph_bId=UiUJ>0`i%~ewD{w4~hf9ed9J(xTF2qD$FD}Uu?+k|BGh^T8 z@%OIv2}XIp@SacllT#Mj#Yv7di}RFnD%_`kmrvYdk6BZ{+kf;0aFn{MKnW&KC}xIKT2K7BzCXOl#CU7u z1JH8C-y0N%HeAT!mkSY}qGp{;`lXDY8aG<5M-xe1t;(&_8IBPQ!P7EItR4WrGHUCo z!|FTHSNHQ;!f`Cg&s37MuI8UN^RF&+88_grtW9~gSN2pL+Y1{k*Vos(dvVU-?WJGy z2qHHP>yzuMv3QIp3%3leyRneg#!u2kw!|hCH$4fpz10ie{20rutgi1|@}BcX)p>2) zBDU4mjA9Yx8#O!apc9o0|A4dL=M3I_U%B>7}l;HzTODSG9eS;&S*T ztn&1jfcvL~`s+fmWDdK2V4#GA70j&^46VlJ0Cp}2L^v}0;QC@aYN)gKNMhq5+^zK6 zS3LYIK#N?nWURW_m}Gujd4zvp9SB7#-l9dCbWy9t0fVv zzucLD*S~+eA)hpb-*lyE0TGOu0a6*h9C0I=n5_E5gFqRw((*JENJ*x{<16E6XwO#l zrCh*Og%)g1BqWC#D{KP&5OV8MhUBcj>*J0dlW_^COw1 z_r%rwX6Xa9TXD(j8j?8EWi=PEd)etW-*7jL?X>NrhVXM+J`&}V8|Jf3l)_`?+FNNl z%kXA4Yehb^oS3eMuX2+%>bx*>NRVG~BbZ<) zqmS-1$d;36hcz3!e{_RwCF%xF$haVEto 
zd+Nhe0xQ+B2X1Cc=@R@k0T-@6w{a%`D=;$XrxF5XgT0%IFK&|BILrWjJMCKAZ-Li( zsem8oRCBz?zx?`wIc_6T{+K^i*?fOfk$$mSbam#rEw!H!6QH|Z&ngK81-OiDWFa?G zU_%Ef$;_e+I{UB@yd&X>{AB*|);BG%Rpx`d&QjWb4&MFfG*TN3{D)2mhLjHYzCZxr zuH!TkNC^ZhsEmM+L#}0Cq8T1p&PZdljOa)=W94L#Lpvi!x1gLbX?7>ygzS2}G+p3c54YnmF9#T}vVd6j<+h3dn)W4=FI~U(K<~4(p%Z9@vMHV!gN(Oq*2dk^$Rw~6t zUJ!zegax?<2pNs8=gzi#4pE~F z>ls&{Qng7ff^$33;OUD8wa_&j> z!6)rTRM!3BfK0qNAOqL{X)=v7kps#GtvZZ>y~LFFW4-blSc;LEAN-@N7k}BJHDZ-> z8~VAQzFm%8CEiwz+Rrm{?(Sm&E05X>n?lPa1%3OU5i}DzH{yyL%51I80(yeECG~*a z^k&!$Q(tP&akPD1`3b;jO*_P5iiQTdcr3+$MX)_5=qpI@8z-2jJS8lE80wiRaz%Mf z*!SE7*YkuvJ3}?_igffdqlWN)6khz`7j$Fny8WigolMH}#cxXfjA)tE z53RyVgXtu10=NyveJFhl`Mqic4iBilae5=OPOIA39WnJXw>iJ`A1AAhbVkuxH({rA z{HgQkDBpL+eW-)x5}sByAP?RqVsWL_^XAEWre$Jnl3pazMs2dvU&``@eHe+r&ZYn1}~fLm~cOEmUSuB2WcbpE;8V zWc~L)hD=}*5C@70HxoYzNd0f&HB;=LPs#+Dx=BDGpkrpnKh0_8auQG+N=7deKN(0z z^#7%)zw7=lOf&V9fx^JC%3;6nPw?KTB85>A^874)kJ1-3Xq2QKSt)Sv-+px>;J$Qm3f!~q$2u<^3Z>K iVPOD(zna>AVKb8=6(|ISm5^zg3Z#J(OZ;oB0RIEc^IP!% delta 9766 zcmZX4Wl){Xvi07$yL)hVmtcY5?(QzZWdj>`cXyZIPH-nka0u=$!TrlQx9Xht-mm}6 zJT*Nv-Bmr^z1B0kB`{GXFm;Hq;A;G>3??K1AWHxMKnDN-9*)jtEM|_TZuaI5uFRhH zc7s|I4!hh~>$pe4b@Vgp9Nt=&3Mx`9g2ZIBfzlmEiK$27;gkqi`>>G^Z!zx%J;iCs zW&%z-2{{n@T7{H<)?8RFquw8HN*QM&uR!{rY z`nF89{eEu?4nK-X+)rPAJ3!M7M?gCTC~y~A4c4W`)$6zVEYjeB&AD=tAkIcBsmtz| z1(MSW`$l{K+BxjKm5iZYr>jKxHjX7N+O5c-Ubfqp9}H2kPu5@N@anTLOj=ElLJrws zsi{1#46YTvU08r}7IrL$%}B(zUEA#1nF4NS*j?|Lc&DB!rDeg)2>j56@%bX9y> zDSvRT-6yk6_^Gpsd~1!EYQO&@6xChJG*&n$3lt!bCM^F0970dCr@M6XO39sntQ$6i zX<_)CYoP8xL6FU(*1{~}x7aAl!F(39Dpg7M?uk1~3v?Z#CZ5fZZzq}>)|WZQ&nKOH zC*%$nzZ)otw$)~^v6DHSBb6|{c;3>R?O`>{9;aLuLj~l|+GqEBjNY!e_qmv4mr1`c zP&x-9ynY$31K%@&4t}#}{dUI~amR=@I|rpA8g7^oHe7Y(BhNwSssPe&Ar4B$UT#vLt~TK-8652)Tew(XP^ zQXTCf7}R&OXqrnQUG5y)uP!4AjQFLOo!@uUakPs2sRmDoL|o$<`2Hn$t_$Xj_gF)PhY#8w~uOPfsvM z+!W^cV8I2mrMy0Zo-Px!WfoTWEJwT@{D$?`u~lC&xVZ3OlF#+dnqD7dg#0r_fnrR; z3YUgqm7V$kPP~x2Se)pM7>ydExRR+qK)LIW_I?qO*#upeK?mg-Z-po^0`*vQROxoI 
zUYx*@AToc*Za^tlS&C9=m{Pm_7vYoWkFfo(pxFr?Qq7iaDotf0kY;XzXN&R6 z!F={Rou)XTCC2r$-cK8T&;I4l*IR=ownKaEbEWSwe8ReRlS#~KU75~XOuA7B|X zd1AdTrBPoYspDmruk!`!*Ciady{OMvQGg6O^@^=8~uf+aRF@ZV> zd|k~3p&E8pE_BHPl>v)@ODJfk{2;>Ggu}d{?0=A!|HKa)1dB#JbO%-WsaKW&{ zYjxfrBVIza6}Td;QlUnng>@nVs@yCmtjo9kkdqGx0N~@}O#8RPZ*Dw4?EWR7jIw1B z8`;SWlvDuY`J%04p*>Tgm6Yq+nN4jy?sCwbw4_QR#h5~k=hPyyJ6aY^!pDd2M?i5U zXFK9ib~T3|UjSMsYoHYb_gsTa5?FVo_{n|iIceo(($!73GEec9uE^vk#TYXUkM+sl&`Bzg|9bb^O!&*0Tg1t_{VQS$?WoTQs&%qCOJz3vuA9!uzmKMZnL(aR$n7n7?!GKXeq7h^9Nzx z^hRgVCs!!6P-`s$&dJYcEwJ`NNIyu?;3hx&58}$m7^4o6Q>aT+DG#H;l)_S-6C*KB zEQ$YqM331fgAtuZ0a%jJ&4G7BA7RV(@v3@rxA{ud%7VhT$!XOZ-lM`_^^~o#$e8KI zM#uof6yJh?5MtiJcSeMv*bubr_219MdlB3yoW0>V$EXYtTcW(_I&Wc zK&>}&ttS$=iipsP;E89z4f}ti?KV_SEFOY5kGbGUMil=JpfXiSjsXiEc z5cd7WAeHK~|CA6J6tfW*#7A%%df3rf@*H*;gcp_Pxx~?Im2vI5qCS)R{SnZiY$_gO z(pF;BI~2tyfi=9b(*f5=$+6H>C2BPJ6eq24jn2ZZ(*IUf^l=Nha~*)QVcBYoP8?Bo z(uwL)8o2MJksp~uVsVWt@#taJ6tfi>?Aep6V_@{x&{HXcs}3d(55w z8QRU(1_{}=LhHlq)z}2NY;sfk@zOmN0xIRi7jp%CQ)_Mt=Qg>^KS6i)7spR&E%{a0 zuDUDdReO#{y6fk`50>mU70R9q==jmjeZj%q(PUJsDz=T4fh@|(In4MC?-kcsgYQO(G)-DIR5M9e?<773yeWaYZ&=m*APDWIYw)Ym?Q1c66U>*Ag}$Ck|M8)?q%(#Y$% zR+4BXXDZw)c8@rHqt90X9hR17UM(RhU6%Zc;$@2L8l_DT4D}z$8G2@;X~*hLZh9zl zMZGQZrXt(hMD8d{W$;Tlvu=bt+a&#gysX#vy`H@xJ_GEx-k9sl6O*z6$8*Cj6OHzU z9ne^249E zEN0XL!+18^Tg+>cEd><5b{4BY#0Hv@2{T$1<-y&?Fbb@4)u9%lxlV(~TYRitSa{!B zN*@9j$`G$2(#+xr#tzlmS=QVsF7 zkZv75Zdql1U;cx1?J<;hPIS#*#ukT4B^|{RcQ)E8hKtMt6N)-iFqWt(Yw01J(D_o$ z$P5M;jG$&wQR!hxXZ&hKJsJ&Rg=^}YU>2>7iFdJh3`Qy!z3lhUh&*dX~I*K}(Y${~yIj5VECUzO-Wx71dPHhS=f#)pMn&txTb7D`VEk$@^ zQwHn%JlC}7BYzSv=)f|cW6PZB)bXKo<}#epaE?}w%4SX#W0hZ8&>aO0^G)QP;^%mH zhbU4tC4){fEvR-{I<#EW?Ng()H6j)d9+^hw;mP*#f9RJx_nSQZk@=p4bIZM7AFtJ_ zb71SEcAZP5SV=~reb}_s*wa{snFIliQ<>F!lO&8o3jSM0$nZukY0ilmH_IO5gG{+9 zR;e?*d76JiD#@8t9gy|%b-nc-E@&U}<}~R+5k7uAqWEn6?xc{e5yD|Nb~WzgiM2P5 z)JmRN?|*+XUlA>h<+T~X$yVU{1iPY^=N58!?jx$)19M4Zy3NX2>KGe|ZNg|3t8)cB{u&C# zCu8idAXUd|spAkGP!X=Y8@{cZNt3VXweB&$b9V8Mqfe=vjr?5Cu*VR}9#}D61Ouhu 
zMueZ@)PFh`fK+na*A{ptjL@4~ywC;d0ZLuXi)~R)2!&Z5d1d#YWMceCKjIf^utsYx z1joVm)M{&s-J&?)kYHtMd|S^AQ-r?a_owR@Vu^}GPe$_3JmJnRw$xV7CT9pu;tX&D zhvc{cSm)i$9f zJjG?d7YT8Voc8$~>-Ew{HToKjo<@5`fIlsjF^g``#j*npRqYd)xnwn?QY?F4PvtE$ zU(rc#{E}>M(9Bc$n$OHn#V7fLPX@W9Oi-ZJ$)>j@u<1Nv8b?#*}J{$=S&nY7Mge2)-d*E0P{+w zQBxmmwE1~WMNhD*{)Cy(RhmGQ+!tY^jD*+6U3=*T5ltWFZ9^RG{1_Y99VUd$3x<`i zoGL|TtHj9MPure&!Q!`MKjNzbeQaER3?Sv+?X#?8f9IKcG``W^zOcgqcV~E4g@F?$ z7o+sv|J^)60+eJSp|Aij062ud2>TP@e0&w*Cdn9|47|#U<-bXDF8sx-Y$`;F0GnRt zXuh>#-Ov9N4}+_F)ZPF6g_wrY82MCeReBdF&@z4Ka?N*tH<`OXl}euj|JmgYaoUZR zhJ5VN;^6(&NjS43mOGt>7dLxqO|x=6>uHH)sWbggFrs)fB;5Di97+xzF3Djy>qM2% zJLrjU46si1Bs4lw#2r5c)_Jgu($VM2@o2F~-qKps4p@mEAm)b`5|2H5r=K!Y7FVDVpdkBnoK#*?m@(Wh(iw z1pbi-H`KQHJD^m!~c7sR36px5$**2%JvGwmzCkjaQpr7Le~^MdNiWnh<>-j;$>G~;6xc6 zcka`!1rEm=koaj}ZA6dl{=Dn5#;PtX$Jd$VVs>`nRGsZ%_-z#dj|XRay{goPB?x9n zvs~QoYkTr!tR8A(?U%T*oe$THUiG(;QAbqBsEZCX^1j2T&SxgMXd9X(-1NgB;Y#4g z<|J<$^HkmlR&<0s$_Y1YLm=P0(fBZaRapI#rUg@_?n>yL=S-wH$gcOi@pkTcjcT%T zfA@;}H}*e1{&5)qfFKW|Ru3ZJgBUosi6D~%8UUC^{Ra)@V(#i{?O^HhFB+;zb5n7b z7tK%j_&uz0S~GK#3BArH^S}tYwT!6>CYf5;#J+Pu7`l4U_NH_3p(H zw^IwFL_{9Nzc86?f+bA1q|#F`^(mE=a@tcu2C$^ZTD+xdH;fb!)kV`qX*Kd#>hV2-2wZ1MB-?q$Z}Et^$v?UXXlR zbvf0HF8D3nh`b)TAugdslatC?o#b?5*|bP*lwP@Wk9x&B2RnYf1oDyh-eu7WZVnfK zo1W0q1`!-LB`v3|s_hfRbSY7>EAYquLOPP8`|62SFr$HL0J5vBCPc)6U+ezJpGON- z=Ku}pU`TUbe;I=Gqgz8>v^Ue#y|Ga@l3R!d7ij%MfkV5xC#ReVkSK%V;gWs~}~!pLIe`mTg9ZD z^5f75N3Lo`2_Ku?GpJRE+YvfFIlQkg@Ag9g92@buKxbx2vz(h%+I`@j(VN)G^EcGc zo&oNf$~j_UA3oEu2Rp;{p$vT$MQjE5#!MrLm*d zsKetz)Qr%LLa^{$1n%9T(E%@FEsrAym<4HlW7(qn(Cmjm&XZ{Ty`+%r4~hA#Rx^S( zBjK_3Bt8R{#CC;w!n-v@pEA+_@hgz2Y}pn6?Xz^5!7{iM5u8Z*CWQxs*qC=+@UC4w zqPzY(E5)G@a#8Cix-NVqjE6UX#5N|yQ_9Nv8arUjpNL|@+q~~rIgnCZBB9R>+~VIJ z$vCX0aW#|S<`gzqx@*t%r4}o9ZW@+Zhz{+#Y|CvSUm&2elurkn2Tn&;r&4#kT5qwc zq?z70l`rrF1MG!Gt*GzqULR;-N|K>#mYMpCYX9hWgS5Lrc!`759*Vm9+>!?2gR|Dm zq(*xOHr-?Y6*%!p%f1We2I+tcoc<)zzJYJm8l`Zcy^@gYXzoXYSj~83R*({s!`RRu zHPLdF2#;4wl0URsl3MGfd_~;1MEou`PuUkOc$pKRa}IItRiog_Rhm+z3L}IR)XrCl 
zX{x$2ZSqUpP@#Y*4b*aC(cVaX6)58wPmlM4g^TktVB~;in!9-f^0_~Dh7H$vGrHY-FU0F2)sUaHptFb9c4R;Duzzb=#w2pDBxMkvMyh(y*EKkTU-$e{_}$Kyr0P(g_-pOQ!uRKGryt+d7uF9RXXw)k&!}u~SFI1OLuokryci z3g&EFIOcS#Z!~Wpmg>_!xq=|cJ2S$QD#(UulR`|J(9f! zok8S(&27+mM$MbUL0_H0+HKmP_MfF3WDSv={*`*U)3+G`TKmrnflGAa9D>@#d53xe zY;g8IQWTudH9qZKta4nat6noxxCqOl0(q=I^K%x+9cHG2+%=-wLG5#VPK3WU9eONQ)Qt z&IaD5+!>P@0f?tKT_jSDoNUzbqR-S3eQI*q15J0V_HP7U=IJdYr8R$TO>g;Ysqw1! z|BT>O9j2X)Uro)ku^O(!fNUn)RxQ5ToL?Ay)^shN|?E^M%0S-D@o*u_<_tTmMn$wkE#=)>|Ew z{K0r#OtIgDrXiC#d1zDO-np*^pSSFu0D%)sYZy*T+1;ohq2qD{si?U&JMT2&E_<^T%7kvgL9XzhYKNq$_i-{$4>-1)V1QyWk$I zb2%!C`!EhDH%?Qxox+>{LN&=4&WOK_v?8#}?MYxEA41Jf>kcD%*H{KlR@ky~F!%|a zIYzJ4{(4JQ4lEZPa)9B=FNRU;POCloY|K9|&3~^^G%io9m_JT8qVj<*LM>4Yqxw7r zFGn|IU~=+v-Ywc%{^3Tku98*aXDTR89tkjQ&Bc@w1+U;g{)RP?4?{l4DT4gsCOG!v zT?U70)M%8|#;Tty{rIzUNvhcJ@d!UyfyRtMxrA0ILEa{&Lq~d5eb3J=jwSKrS4E4LK%z5z752%rTqq&hst!A8jV)zd_f(F&C$gFW3nK z0O)URa0I5Kf$76M^qiDj$Qq>yaWb`g63j#X2B0MV%Yq*>gYLWnCpK)RJ2sMnpj)%- zz(p$k>#>4Y!^lT_`XqylS5Rk+&iI6>WD%+>)$B~B^a87_Y~OMO$UUE{tcu8Mf{RTl zlXMe3Bf3v|54P_iwNiukbRP((@LJ)yV~)mU4Z4 z{rwZGOqOoCO-D~WlaOYSxd0luGz#@n!mnv-no!QO0xq5~+&2?47fndvFDNJl5dnUh&w znUks&%M_ancfKUAm-tM7zRJcINknj(H$6UvhK|GSf#BszxgX4I=%sxTFh@3*+L42t zye7gL&S7_@1`;kgSn;OORf(6DCGq&S|KxxJGmJ4NRY1wuPmzA* z^z^Q>7dOL0J^)-byV&B%_ z8z_2~H#e9|a#KuMdzj8g#g#!s$B@*THom-g%*&c!^&Mx`z*@sbq7N^-k(NG}OfQ#p ztQ-;}+s0hY0+u(!`wL0PzVw-Hp(k|o3vd?2NNjs1j;#tGU899B`)*fGfoMu68%!vN z*xLgBfp%PnA12WL2WNtH;*Ve30lvidRINnJq}GTT$G~083)*Z)(Zf6*73&7(_+SRw zUI{eMx)+JGi@mIGS57?8OT7_Hr<2%chlz02AMRMT>W`k%#q9A$HeVwp@ZzqM1|5FM z@51MfTysY2ea*zQ?`5luwZ@(b-x{iqV9)0TOHOzwnaes@Z%}%Cap8HVH-Ch|)NPDK z*B~{Aul)2Fs7Q7Jts!kJ$mh9EWAgKe(GFO!={SQ>ToYIhK`_3 z^Fbp!<`*{Mg1%qtCoD)`>9516z2Dm&XWZBbxWHhA z+DowRlPNNmw7D`f$j;kevsmNxbR7;9hyRQ~dCuWrX%=L~AtSWuZ{mkV#2-#NOcx3+ zaI>|+-)!KTaZ1!vfgXhtmn1Z@1CjHQL0=}gACBXY@rGbvS>19Hm)!12d>jAZo1quZ zmNBv4Ryg^pit2nU#Y ztnm~apx_Z`GIA?;bGl=$pY=Ak6wQN!yF)tI6&?Y2u6@+KE;LM`WoK7GL0iljBj%pJ 
zQQ>!5m{aXo)NhqQE%wGedlvnDX?ER`Z1%*yS)|Sv8j$EAJ`K#1MSs|}*J>`L*%XH7 zc;$EYyi;Nuk11snTP-^KUg6785MT2=*(gbX`XU3uIzEv9xcX^RK|pvY+oxsD*0XRR zCr~l-OROfNbyJuIA>G-<3YWMEw%x?Nq;%Cg2zFU%_J001h-3HgFoW%fK&$*Wh|^+? zQ`U?jV}Q(&OyvN-4}I$|a-{6T%AFXEIuw?zet|T{NFY7XHZ3y}NCH`=ur5mnrgA)t73QalGm-U!>h0>!CuuQtjeBSR&w7( zXO74?d?j@hOWIqXoCuu3wPzSf)#4=6A!^_6H?3L;Gm{<*o?A(7?4C&$5=LGRKw`SMPjOkRqn3q>MaYM-r#jM~@7yU=YqQtGsso z6E}g+fUe&G#A!J_NrpKhp7xM5J-ax?##bozwQ`m-_;orruET5!bO*5S)1u)B zUnhk!R2(}dJx{mozp>xp9>;+Opr{LTalxI-4F=>eYcdh5DQd<(#8Y;l73EKcYW(Zn(|_d|FRd;PCB;eR@k z4glaU>-7)eNdqMS1tAs!(%KV%)P(;jQhy8C|8~Ya?KuHRMf9IU^VgsM?aQx_G=W4Q x1>wJ<9`f%hG!y`U_K%YQfWS1bM4%8Jpcdz diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index c0c0a497..9d98abfa 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE = True +GENERATE = False # Debug