fix(pypdfium): resolve overlapping text when merging bounding boxes (#1549)

get merged_text from boundingbox instead of merging it to prevent overlaps

Signed-off-by: Pedro Ribeiro <pedro_ribeiro_93@hotmail.com>
This commit is contained in:
Pedro Ribeiro
2025-05-19 14:26:00 +01:00
committed by GitHub
parent 12a0e64892
commit 98b5eeb844
52 changed files with 52225 additions and 4690 deletions

View File

@@ -4913,9 +4913,9 @@
{
"bbox": [
315.65362548828125,
489.1985778808594,
489.19854736328125,
537.1475219726562,
563.276611328125
563.2765655517578
],
"page": 1,
"span": [
@@ -4979,9 +4979,9 @@
{
"bbox": [
312.10369873046875,
541.3901519775391,
541.3901214599609,
550.38916015625,
713.5591354370117
713.5591125488281
],
"page": 3,
"span": [
@@ -5003,7 +5003,7 @@
74.30525970458984,
608.2984924316406,
519.9801025390625,
714.0887985229492
714.0887908935547
],
"page": 5,
"span": [
@@ -5024,7 +5024,7 @@
"bbox": [
53.03328323364258,
284.3311462402344,
285.3731689453125,
285.3731384277344,
534.3346557617188
],
"page": 5,
@@ -5047,7 +5047,7 @@
49.97503662109375,
604.4210662841797,
301.6335754394531,
688.2873382568359
688.2873153686523
],
"page": 8,
"span": [
@@ -5066,7 +5066,7 @@
"prov": [
{
"bbox": [
305.5836486816406,
305.58367919921875,
611.3732452392578,
554.8258666992188,
693.3458404541016
@@ -5111,9 +5111,9 @@
{
"bbox": [
216.76925659179688,
348.65301513671875,
348.6529541015625,
375.7829284667969,
411.5093688964844
411.5093994140625
],
"page": 8,
"span": [
@@ -5132,10 +5132,10 @@
"prov": [
{
"bbox": [
383.1364440917969,
383.1363830566406,
349.2250671386719,
542.1132202148438,
410.7686767578125
542.1131591796875,
410.7687072753906
],
"page": 8,
"span": [
@@ -5220,7 +5220,7 @@
"prov": [
{
"bbox": [
51.15378952026367,
51.153778076171875,
447.09332275390625,
282.8598937988281,
687.6914825439453
@@ -5286,8 +5286,8 @@
"prov": [
{
"bbox": [
55.116363525390625,
542.6654510498047,
55.11635208129883,
542.6654968261719,
279.370849609375,
655.7449951171875
],
@@ -5375,9 +5375,9 @@
{
"bbox": [
323.46868896484375,
327.739501953125,
327.73956298828125,
525.9569091796875,
429.5491638183594
429.5492248535156
],
"page": 15,
"span": [
@@ -5421,7 +5421,7 @@
66.79948425292969,
293.8616027832031,
528.5565795898438,
538.3836822509766
538.3837127685547
],
"page": 16,
"span": [
@@ -5443,9 +5443,9 @@
{
"bbox": [
315.65362548828125,
489.1985778808594,
489.19854736328125,
537.1475219726562,
563.276611328125
563.2765655517578
],
"page": 1,
"span": [
@@ -6250,10 +6250,10 @@
"prov": [
{
"bbox": [
310.67584228515625,
310.6757507324219,
636.7794799804688,
542.9547119140625,
718.8061141967773
542.9546508789062,
718.8061218261719
],
"page": 4,
"span": [
@@ -9329,7 +9329,7 @@
332.9688720703125,
148.73028564453125,
520.942138671875,
251.7164306640625
251.71649169921875
],
"page": 7,
"span": [
@@ -10152,9 +10152,9 @@
{
"bbox": [
53.62853240966797,
499.60003662109375,
499.6000061035156,
298.5574951171875,
573.0514221191406
573.0514526367188
],
"page": 8,
"span": [
@@ -12941,7 +12941,7 @@
"prov": [
{
"bbox": [
83.31756591796875,
83.31759643554688,
304.7430114746094,
248.873046875,
395.9864501953125
@@ -12968,9 +12968,9 @@
{
"bbox": [
310.3294372558594,
655.8524780273438,
655.8524932861328,
555.8338623046875,
690.8223266601562
690.8223342895508
],
"page": 13,
"span": [
@@ -12994,9 +12994,9 @@
{
"bbox": [
309.9566345214844,
607.2774658203125,
607.2774353027344,
555.7466430664062,
637.3855133056641
637.3854827880859
],
"page": 13,
"span": [
@@ -13019,10 +13019,10 @@
"prov": [
{
"bbox": [
309.9635314941406,
558.4485473632812,
555.7054443359375,
596.2945861816406
309.96356201171875,
558.4485168457031,
555.7053833007812,
596.2945098876953
],
"page": 13,
"span": [
@@ -13175,10 +13175,10 @@
"prov": [
{
"bbox": [
51.72642135620117,
447.7554931640625,
283.114013671875,
518.3907165527344
51.726383209228516,
447.7555236816406,
283.1140441894531,
518.3907470703125
],
"page": 14,
"span": [
@@ -13201,7 +13201,7 @@
"prov": [
{
"bbox": [
51.434879302978516,
51.43488693237305,
300.17974853515625,
310.7267150878906,
338.51251220703125
@@ -13253,7 +13253,7 @@
"prov": [
{
"bbox": [
51.27280807495117,
51.2728271484375,
200.086669921875,
311.0897216796875,
238.271484375
@@ -13435,10 +13435,10 @@
"prov": [
{
"bbox": [
319.06494140625,
319.0649719238281,
122.80792236328125,
533.77392578125,
182.1591796875
533.7738647460938,
182.1590576171875
],
"page": 14,
"span": [
@@ -13461,8 +13461,8 @@
"prov": [
{
"bbox": [
55.116363525390625,
542.6654510498047,
55.11635208129883,
542.6654968261719,
279.370849609375,
655.7449951171875
],
@@ -13513,10 +13513,10 @@
"prov": [
{
"bbox": [
323.0059509277344,
569.0885925292969,
525.95166015625,
670.4528503417969
323.0059814453125,
569.0885772705078,
525.9517211914062,
670.4528656005859
],
"page": 15,
"span": [
@@ -13540,9 +13540,9 @@
{
"bbox": [
323.384765625,
447.90789794921875,
447.9078674316406,
526.1268920898438,
550.0270538330078
550.0270690917969
],
"page": 15,
"span": [
@@ -13566,9 +13566,9 @@
{
"bbox": [
323.46868896484375,
327.739501953125,
327.73956298828125,
525.9569091796875,
429.5491638183594
429.5492248535156
],
"page": 15,
"span": [