From 499960449397932d9c1f6d9761057922b1417a12 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 11 Nov 2024 20:52:06 +0100 Subject: [PATCH 1/2] fix uint64 hash to pyarrow Signed-off-by: Michele Dolfi --- .../language/pdf2parquet/python/src/pdf2parquet_transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py index 0f5de10c0..20ef49dc3 100644 --- a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py +++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py @@ -24,6 +24,7 @@ import filetype import pandas as pd import pyarrow as pa +import numpy as np from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import TransformUtils, get_logger, str2bool from data_processing.utils.cli_utils import CLIArgumentProvider @@ -237,7 +238,7 @@ def _convert_pdf2parquet( num_pages = len(doc.pages) num_tables = len(doc.tables) num_doc_elements = len(doc.texts) - document_hash = doc.origin.binary_hash + document_hash = np.uint64(doc.origin.binary_hash) self._update_metrics(num_pages=num_pages, elapse_time=elapse_time) From 6f5e2cdfe1d63d1716434c311d8a94f8fd059800 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 11 Nov 2024 21:07:04 +0100 Subject: [PATCH 2/2] update test results with new type in the binary_hash column Signed-off-by: Michele Dolfi --- .../test-data/expected/archive1.parquet | Bin 23361 -> 23307 bytes .../python/test-data/expected/metadata.json | 17 ++++++++++------- .../test-data/expected/redp5110-ch1.parquet | Bin 9683 -> 9632 bytes .../test-data/expected_batch/metadata.json | 14 +++++++------- .../expected_batch/redp5110-ch1.parquet | Bin 27200 -> 27147 bytes .../test-data/expected_json/archive1.parquet | Bin 10880 -> 10828 bytes .../test-data/expected_json/metadata.json | 17 ++++++++++------- .../expected_json/redp5110-ch1.parquet | Bin 12073 -> 12022 bytes .../archive1.parquet | Bin 19976 -> 19923 bytes .../expected_md_no_table_no_ocr/metadata.json | 17 ++++++++++------- .../redp5110-ch1.parquet | Bin 9683 -> 9632 bytes .../ray/test-data/expected/archive1.parquet | Bin 23361 -> 23307 bytes .../ray/test-data/expected/metadata.json | 17 ++++++++++------- .../test-data/expected/redp5110-ch1.parquet | Bin 9683 -> 9632 bytes 14 files changed, 47 insertions(+), 35 deletions(-) diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet index 907fb3803409984ec6c0877bd457432d2d3f8e1e..f68ff66e19381d604d2ccd48b920e31e847dc341 100644 GIT binary patch delta 1240 zcmX@Ojj?+hbtIALjJ%bs_9rjy>}m}`Hr z=i+?AABkBF`n|8bic=PHE%dNkiS-c7R(yBOiul?Yr+j;QOnHC0722mkV z6&;|d0-|i9Y%*pH8Y;=DsYVv&28Oz(MkeOECML$_x|W7!NxFuHhGquFNy(OpY38O9 zDi*1U7D<-L7P=-DhGx1ZW=6)kNlD2jx@IY6sfGpyCMKz=29pKk#27D4RtQ#zoW`v7 z<`SdYo{NkGEFsyBej|y=b;d%I&zmYRnoa%?EH+unOq6lcWCJsK{soL;M}U@IXH9$`}Z z_MB1e!!vLgqv(QZ!e$~&4&u(q(D3vG`i5cg^EdV&`qqOR_UMk5;DD)sus~|W7&I_N zCns8qGtQX&J5gCmD-7lh_b+JA^6;&)tp)WP5m4h{xU22mkV z6&;|l0-|i9Y%*pH8Y)StCP{{7#-_T7#-^saCdn!0x`~OFX1YeE1_s7yhKVKyCT6A* zDyAtGmKKJF=DMaSNr}29rbb4(i6-V2x@n2Y#%2bIsg^0msgnid#2Bwk{uiVWIfGg4 z-6ck~Jr^1AT0({$f6q)9MOd@s3~&0LyMY0^U^O)MFS#Tj5-Us%L;v8e62$EbE=vPZ}qpu5V|{yYUL zWr3Q{Qk+?p3RVhn*u5u=Y7?G-9Htm5Jh|Ue&n^!ol5I*E8B&;}7#J9g42(>44GnaS zEJF-UtPCxz42<;5&5SJ!4UF}S3@i=IO&J;3Tqpmulrcb71e27I!fgO3nwYv?O^&rv zbUneOcIE}6+M(xQUt`fkl!4GV@dCR0K}9jI1CVAgD!OBjp57!lU@E{Yuo^K24J@MI zr1WmGpS6b31z?0Wy#a<97G*dzL)^nL`LJ~|(}ItaHEh(_W_@5(JM(^WVwmydcAI&d z^=!|J(jlpe5Roi}NlKEF|J#ZammKGkmJ~Ia)u@~%L>MWEKv71tcr)Q#wS`fPMU6x3 r#pZcodQ6Otlg~z|^Ex{E2I%YSmpcN1+h)c{CMJ${IR=IR#~?!h6sK|) diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json index b9a535098..330ee3a5c 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:17:59", - "end_time": "2024-10-29 14:18:05", + "start_time": "2024-11-11 21:04:30", + "end_time": "2024-11-11 21:04:38", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 16.8, + "cpus": 21.1, "gpus": 0, - "memory": 31.22, + "memory": 32.09, "object_store": 0, - "execution time, min": 0.108 + "execution time, min": 0.139 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33044, - "processing_time": 6.478, + "result_size": 32939, + "processing_time": 5.596, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet index 39613b1d11428da1c8a0da0c7fe9f02ddb1908a6..17a7cf950f26c3ba9a2a3bea0631053eb829c83b 100644 GIT binary patch delta 1111 zcmccYy})~caJ{i%T1sM~iGi-MS+cRNiJ_UHZjyYLZ2Yfw?GysF0|N zjwq9;fGC?Nn~WKQh6*wIBnYcw1{yzkf}q&s8N#|=R5BA}WT1f7WJ?hVMzhHmMZ_l8 zh=?-%2%PLMC(Ap5QEUsNT2r9fkpLNxWQE*;5E)S(NmB+k0Z9qw)QS>O9x*1dDzSNt zYWITG7Hv)!m1R`j7Ku>9l95=P0ao;eS?n9L+Ji8)1EHWWnXD`3I5|c>bh5p8K4aJ9 z_2SYjZ(`KeO+F`{5ND`sXc%H-Xk}nxWo)5mVPI}xW{f3RrHl-WOn^$UsFuJZ2MR%^ zPl=P4NGLmONl-f#4{{D3^?1z!2AvZF14BWCf}jJs6D2rcG6I=VVKMmjn=I&D6w%*4d;o{NDYz%j@W E0CoWgNB{r; delta 1074 zcmZ4Bec5|~uu7tdVXCQNVv4S%VT!q~iCJ2rZlYyMl5T2BN=mYkp@nf`Qp#jU5n*ql zR7wz~YVrg@vB@)pb-jqOoJb=B1+*qxib#k`aEOPPZ8w_D&WK=%O%4*#W?B$5*+5QK zaR;N=9Y(dLK(!+QG6*FIme}MdIgQB|MFQj|gvyBWNSZRR2}nvXr&g4R@`y2sRf)}G zR9hFK_G$7$ImO8dqJNm$q9(_QNwIDSSDO$vdB14KsrYI|Bnl1Ls<%$pVsMsvw4g1P4^eL5x8I z&PIrWT`(bSa+HFa!KW0pEh)fIh3P=BaOe#b5Sx5nvV`%<skc% zA{Ko_nF$F>ao>iy7O(7qo?$p8aRWpr>@s0=5M>Y*5mnOxdRqX9*<>U*U@~A9Sgja? z28QTl2QNvcJ1Ztfd#M}kSVX=ifOW- zx60(X-WxboWqj5MI{@Qhv#-xzQEH~EhbbZi(vS#B`kB1ORf$;*J^jqVNIUk-YSfN5 zkWCvFq-%Qt#l{ z@NJ$McjU9Gz3+qnXkI^^u%f}uB|%3qE8?fgezk8|tog0q7hWitqNXIkFfmcsTO}pg z!raKjz)&|aDcMNZB-zMZH__B6QP<4eILSOI)!foD(LzGS$i&dV&?MDB*C5HXn zh=-VMH=4~3l$q$F!?a-XrnU%4Ho{^cEk*Tqnsky0{fr+7>p{0?br8yJB z1MSJ@J>(3K)WRhtq;Q)9iU6ivODCIo%6T*~i#=dc`?Z)!?baeD88ZeAOntFQ8gW)p9O%J4H)z=888d1R*XRdizwK+8&*y> z^insPu!2c#({fP65v?2KumxTXOq145HcVAxZ&|~n*0-8z@d8v9#jE+6?00)g9R#tbGVj!roSh5*MPLjY!=X;T0I diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet index 7f34e1ba88d2cf2783bd056ca1fd6466e49f88da..42b0a245de8b32b108dabf57ad9e3de87ccabd6f 100644 GIT binary patch delta 1111 zcmZn&JrlCQL%hC^(MOa;ltGArL1U}_Uxk3Lj0{Yzd;%L(`L_s7esXt<(58pnh6g;| zKOHE1p&7gQLfZk~X-Bo@?JTX}}&C=9J*TlfWRM*1T&{)^f$iO7o(89#P#4^cTLdC>5 z#lk!(IY~D;(ZX2QB+WcU*CH)7RoBoY)x^vs&B)ZqBzf{dF)_v&lldj&!>2H-ZHiNC zi6vkK$u=yMlb*alLZ8uWvcII*wn@CU+_;Zf28GU}XA{J$bFN6zif)wO#3x&nVBDoFFDX`G<4@)189JQ8H5O z`|{O3c_hMjRXwX>s(jG`N@Xx$% zj~>es955AN7FdlKg9a8+u=Dm+O!k&j(_c`ocBxE;XuXpc$kj8ws-CPUugdbIN^RO? zFZqj{sxm%ngdKqCceAuY3=@r$#zaLCT!}%0Sq(KcD4{2W+1L|+CbJr~VhiLvNYEOP z7_|12L3&Zc)|gxD++kKR7BvpBSLvHY)pVE`11CFasPjfTIy*YLIyy!=IyyRSF4JIQ O;`qSDz!2aVWC#Ff?ljH- delta 1090 zcmX>T(h$1ALp*Q-qmL+yD1#6KgT_|rGe&pmEw{4hg1!$CT-hV_;3$vE{-i#caAniwli4v-S@R6s|bYx^Yl{5LjloaciOtqej$q7;&lND5iCMSrAPd+c5z_hM#vZ0I=>x2Te zL-~^fWD=zebd4-S3{9+z46F=H^^6RS%`7d=41hsqG5NHNf&r>DL_$K!$iT=%*AP_> zq7@WYOq)t4PgPNN_)@I)t_T!TSk)6@79><8coba1cpMmj!1lw9$>(LoRKWrg98e*9 zF$N6~8%-4KlnIrS`Bc>mK9#F2DVG83KvRW7FUaa%xlE=#HIu7URav%GtG$@4D1UMD zZ}}J|8mEOqMG?aBF2t;c6!DYyNGdX`VTpKQ^a!_OR-;zzfZR!L@PhQB2CXr-*tf&1 yVk~MLVpB3UKUCFWVsxC$r=ia4=;#}uudiS32n24Moiv!3IDT_6Fa$UT83F)sB{1v& diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json index 04bec2b88..ed05c6b34 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:20:01", - "end_time": "2024-10-29 14:20:07", + "start_time": "2024-11-11 21:05:31", + "end_time": "2024-11-11 21:05:36", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "application/json", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 18.0, + "cpus": 21.4, "gpus": 0, - "memory": 30.77, + "memory": 32.33, "object_store": 0, - "execution time, min": 0.105 + "execution time, min": 0.096 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 22953, - "processing_time": 6.282, + "result_size": 22850, + "processing_time": 3.229, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet index 32905aa74d9d9bdec26cb6d83c07c16dbd219bb5..0f4bda73ea6a56650936193b1484c5ac9a7f0334 100644 GIT binary patch delta 1075 zcmZ1(_bqmVt&FlklCfc;rJ=cQnu&#>u1ShTnr@P%fsw9hiiw2EVse6vs236H z3?x*DQ8js?wAkdUGP=|>GEquva+$0IquFFWIkCw*Wks1jl})}NE6Y29QEUsN+MiOj zeWfxW$qKn$l`^6{lBNu70+JHUsTC!nJYq~@RbulP)oxU%&6zBxp*Z=d++W2_bqHlF z8HvRiV0CYp#eOiW-KkdFRW&(LW7gz&S@FqJ6e1YgCcjmXVSUlasJ3b{pJJl0p{}7} zh>@X{fvJ_TnVyBYfq^A36h+LXj0}uSfO4491`>GWKtaazrEPMeri#tWJ;598#O3zL#oxwqrK~zXoMMsoLR6vwXlubs0g8>LYG6=N#U}U5YBS9$pS(v_mUjoE*d0c- zKc#B>N@WnzVw3;KCNXtYO-_)LU|CVA_GWX9oE#%lQ~hLq4Jp<&HELbell3%aO^%lp zpPZu*!FX-*DGeFc9ZigCzb3!daG0#7s3>BfYh)Q>Xkuk#U}a#gXK89+U}Q8oQAT)j zo+7&eHc1I7BLgE7T|;bgpip5t(mq*FQ^l#RjZtk*E2E4Vg9aWQcuh-`(zQRZV>gq7 zD1)ewsEUp#lc<0wn<$%%1P22H16T&Z5@XQ7A_{iVjPA*+G}VoMbuy~$>0~5DccPTo zWHIFu#z&L8mDSnL^fIb_>tUQY&t~#hvRNJ(PQSHe_Mj4Qd*yM9=LX$&u0~Eh) zWt0)+ku+ss6Ofc(POT^r2r)2dY}Nm(V3W$o z;KC%uz`$T+U}U0eXb40hMut`fra+`;Y;0_3YHnt3U~Xp2$iNn0$vC-CTi9F*n_7q} z39NDubAUlB#<*&-qoI=P5hk@ahZxnK9RzxxK?AEEBFqDZEg#V346(9z?LqXpzR3>^ z#Z|$42@Z(JReLc84KN#|3Ly%1Oy6WG9cgX8h=;-LQ`JXov6Gta7%?CIJ83F*-S3d3l delta 1126 zcmcaSo3UdK;|4Qrs|BX~E%#ShrF8DP+f^O9+9)kk>_UWbUFO7izar5k$B#afPEKRG zniP9K^JBiK|J5g+)A-H!4+N#Du}m(|7WFbUwMaC#Oi9+YG&8i+HAyov)U`-TG1pBs zN-{T3GdHqGF*1};F*Y_&O|wX`&`mNlNYXX2Oij_XG)ObmO*J!1F*Zy#GqW@^n`9_9 z`Kh)66-}J1sHHWzQ%6Bmf@8_&g$hZ5LIx5X9d7Sq=Cg`&h)rIlqs@3>@--cK`j0`DEQVa|X zMg~SEx`qb2MwTIlCRT=)R>p>U7Di?UmX=0(hGu5whDMAGY_67!lMA(l&809^!&FIN zk%O583|hv2lf?~{Tu(5m9XQOWw(St3j2VLl7A?dW2MSjYpwk-;7rnO!(hU3dU$qxy z5ET+t(E)l=K$K0C4H$|r6<`)vjTnOlhUnxEh7ydACii)07#umysP^p`C<2JonrPI( zH08|Xe;#UVeWw}K4xF0IZ+vm{OJiOk>ZT%LCN)wLiUzY9N=li0)=ddDp-ld-H=9Wf zEulD2IgWs?COce3nbpt()|gA|*%U@G7BvpB1Dn%5b(t6)Cr|ZO=XG@S4ba!uFLwk2 Tx6LQLnV2}H026kAV~`;LSdU8< diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json index bf5c9e12a..e8a3894bf 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:19:30", - "end_time": "2024-10-29 14:19:33", + "start_time": "2024-11-11 21:05:04", + "end_time": "2024-11-11 21:05:06", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": false, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 17.3, + "cpus": 21.6, "gpus": 0, - "memory": 28.85, + "memory": 29.57, "object_store": 0, - "execution time, min": 0.043 + "execution time, min": 0.041 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 29659, - "processing_time": 2.554, + "result_size": 29555, + "processing_time": 1.997, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet index 69bc4e421e08636b0d2d0a608b6925d371c8e1e6..db8b58790e17709bc4cc924bfb1f47474d895581 100644 GIT binary patch delta 1109 zcmccYy})~caJ@yUp^1gDNs4Z=fq|i}iLp_NuBEY&rEY3kN>ZYcNt%(Rg@Gu8sF0|N zjwq9;fGC?Nn~WKQh6*wIBnYcw1{yzkf}q&s8N#|=R5BA}WT1f7WJ?hVMzhHmMZ_l8 zh=?-%2%PLMC(Ap5QEUsNT2r9fkpLNxWQE*;5E)S(NmB+k0Z9qw)QS>O9x*1dDzSNt zYWITG7Hv)!m1R`j7Ku>9l95=P0ao;eS?n9L+Ji8)1EHWWnXD`3I5|c>bh5p8K4aJ9 z_2SYjZ(`KeO+F`{5ND`sXc%H-Xk}n(WniXfVQyq(fGt?1j0}uSfJ(8bmcSzi3PGk% ziIbN|C_8LPP&*Y5at;YcAG@@n{#$mDcUMKv{2l1Lb8@~~%Cqh90zJwk54i=hU(6_?ne z)r?|HY8+zIHm57{ruBCxxvTmBOIZ)ip(9+T*WwN7)us2aE zC5Tcrd4izW+IFT9YkBBt#`R#6!%s8_i~CM6kpr2Z?AiEeM)yASbK1 zgHh}bqgqp-+K~Vmgc1ZxY;u&G#^j440df;UWkh)-O&Qn(Bqf+rD@sIp#F)gY#O5)o ztqW26Gdg2j`ww#(Kt#*9G&k9xdj0mG1ufq}t+ecEJxNeLD9X`k&SI3NO_?Zp^0 z;B15_*!~G=lcN;W3_hi(ZAk%!DNF}~g+p(kfY{{ok|m5+CLdK$W7(IX_G0pT1?$Nx zrRHr8kZxe2Zt@UjRwE$=XcCTnC2TP-j2hz(RE!&-2gna-VJ0<{P`2SbwVF|kNsU8n r-)0jbtIALjJ%bs_9rjy>}m}`Hr z=i+?AABkBF`n|8bic=PHE%dNkiS-c7R(yBOiul?Yr+j;QOnHC0722mkV z6&;|d0-|i9Y%*pH8Y;=DsYVv&28Oz(MkeOECML$_x|W7!NxFuHhGquFNy(OpY38O9 zDi*1U7D<-L7P=-DhGx1ZW=6)kNlD2jx@IY6sfGpyCMKz=29pKk#27D4RtQ#zoW`v7 z<`SdYo{NkGEFsyBej|y=b;d%I&zmYRnoa%?EH+unOq6lcWCJsK{soL;M}U@IXH9$`}Z z_MB1e!!vLgqv(QZ!e$~&4&u(q(D3vG`i5cg^EdV&`qqOR_UMk5;DD)sus~|W7&I_N zCns8qGtQX&J5gCmD-7lh_b+JA^6;&)tp)WP5m4h{xU22mkV z6&;|l0-|i9Y%*pH8Y)StCP{{7#-_T7#-^saCdn!0x`~OFX1YeE1_s7yhKVKyCT6A* zDyAtGmKKJF=DMaSNr}29rbb4(i6-V2x@n2Y#%2bIsg^0msgnid#2Bwk{uiVWIfGg4 z-6ck~Jr^1AT0({$f6q)9MOd@s3~&0LyMY0^U^O)MFS#Tj5-Us%L;v8e62$EbE=vPZ}qpu5V|{yYUL zWr3Q{Qk+?p3RVhn*u5u=Y7?G-9Htm5Jh|Ue&n^!ol5I*E8B&;}7#J9g42(>44GnaS zEJF-UtPCxz42<;5&5SJ!4UF}S3@i=IO&J;3Tqpmulrcb71e27I!fgO3nwYv?O^&rv zbUneOcIE}6+M(xQUt`fkl!4GV@dCR0K}9jI1CVAgD!OBjp57!lU@E{Yuo^K24J@MI zr1WmGpS6b31z?0Wy#a<97G*dzL)^nL`LJ~|(}ItaHEh(_W_@5(JM(^WVwmydcAI&d z^=!|J(jlpe5Roi}NlKEF|J#ZammKGkmJ~Ia)u@~%L>MWEKv71tcr)Q#wS`fPMU6x3 r#pZcodQ6Otlg~z|^Ex{E2I%YSmpcN1+h)c{CMJ${IR=IR#~?!h6sK|) diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json index b9a535098..330ee3a5c 100644 --- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:17:59", - "end_time": "2024-10-29 14:18:05", + "start_time": "2024-11-11 21:04:30", + "end_time": "2024-11-11 21:04:38", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 16.8, + "cpus": 21.1, "gpus": 0, - "memory": 31.22, + "memory": 32.09, "object_store": 0, - "execution time, min": 0.108 + "execution time, min": 0.139 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33044, - "processing_time": 6.478, + "result_size": 32939, + "processing_time": 5.596, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet index 39613b1d11428da1c8a0da0c7fe9f02ddb1908a6..17a7cf950f26c3ba9a2a3bea0631053eb829c83b 100644 GIT binary patch delta 1111 zcmccYy})~caJ{i%T1sM~iGi-MS+cRNiJ_UHZjyYLZ2Yfw?GysF0|N zjwq9;fGC?Nn~WKQh6*wIBnYcw1{yzkf}q&s8N#|=R5BA}WT1f7WJ?hVMzhHmMZ_l8 zh=?-%2%PLMC(Ap5QEUsNT2r9fkpLNxWQE*;5E)S(NmB+k0Z9qw)QS>O9x*1dDzSNt zYWITG7Hv)!m1R`j7Ku>9l95=P0ao;eS?n9L+Ji8)1EHWWnXD`3I5|c>bh5p8K4aJ9 z_2SYjZ(`KeO+F`{5ND`sXc%H-Xk}nxWo)5mVPI}xW{f3RrHl-WOn^$UsFuJZ2MR%^ zPl=P4NGLmONl-f#4{{D3^?1z!2AvZF14BWCf}jJs6D2rcG6I=VVKMmjn=I&D6w%*4d;o{NDYz%j@W E0CoWgNB{r; delta 1074 zcmZ4Bec5|~uu7tdVXCQNVv4S%VT!q~iCJ2rZlYyMl5T2BN=mYkp@nf`Qp#jU5n*ql zR7wz~YVrg@vB@)pb-jqOoJb=B1+*qxib#k`aEOPPZ8w_D&WK=%O%4*#W?B$5*+5QK zaR;N=9Y(dLK(!+QG6*FIme}MdIgQB|MFQj|gvyBWNSZRR2}nvXr&g4R@`y2sRf)}G zR9hFK_G$7$ImO8dqJNm$q9(_QNwIDSSDO$vdB14KsrYI|Bnl1Ls<%$pVsMsvw4g1P4^eL5x8I z&PIrWT`(bSa+HFa!KW0pEh)fIh3P=BaOe#b5Sx5nvV`%<