-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdfrects.c
1863 lines (1646 loc) · 47.6 KB
/
pdfrects.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* pdfrects.c
*
* functions on poppler rectangles:
*
* - functions on individual rectangles (containment, join, etc)
* - functions for rectangle lists representing the union of the areas
* - functions for blocks of text in a page
* - functions for short, recurring blocks of text
*/
/*
* the textarea
*
* the union of the rectangles in the page that contain text
*
* the functions that find the blocks of text first nullify the rectangles of
* white space by turning them into 0-width rectangles; the resulting rectangle
* list may not contain them
*
* RectangleList *rectanglelist_textarea(PopplerPage *);
* a list of rectangles that do not touch or overlap and cover all text in
* the page
*
* RectangleList *rectanglelist_textarea_distance(PopplerPage *, gdouble *);
* the second argument is the minimal distance to consider a white space;
* lower values lead to finer coverings of the used area
*
* PopplerRectangle *rectanglelist_boundingbox(PopplerPage *);
* the overall bounding box of the page: the smallest rectangle that
* covers all text in the page
*
* PopplerRectangle *rectanglelist_boundingbox_document(PopplerDocument *);
* overall bounding box of the whole document: smallest rectangle that
* contains all text in all pages
*
* PopplerRectangle *rectanglelist_pagelargest(PopplerPage *page);
* the largest box in a page
*
* PopplerRectangle *rectanglelist_largest_document(PopplerDocument *doc);
* union of the largest boxes in the whole document (NULL if no text)
*/
/*
* the algorithm for finding blocks of text:
*
* 1. C = list containing a rectangle for each character in the page
* (obtained from poppler)
*
* 2. C = join consecutive rectangles of C if they touch or overlap
* (this step is only for efficiency)
*
* 3. W = list comprising only a rectangle as large as the whole page
* for each rectangle R in C:
* - subtract R from each rectangle in W
* (each subtraction may generate up to four rectangles)
* now W is the white area of the page
*
* 4. B = list comprising only a rectangle as large as the whole page
* for each rectangle R in W:
* - subtract R from each rectangle in B
* now B covers the used area of the page
*
* 5. for each pair of rectangles in B:
* - if they touch or overlap, join them
* repeat until nothing changes
*
* 6. return B
*
* variable debugtextrectangle is for the intermediate lists of rectangles:
* - if its value x is greater than zero, the algorithm is cut short after
* step x and the current list of rectangles is returned; this allows for
* visualizing the intermediate results
* - also, the number of rectangles after each step is printed
* - if its value is -1, the number of rectangles is printed every time it
* changes during a subtraction
*/
/*
* rectangle sorting
*
* rectangles can be sorted in two ways:
* 1. by their positions
* 2. by the order their characters occur in the document
*
* rectanglelist_charsort() is of the second kind: the block that contains the
* first character in the page is the first block; the second block is that
* containing the first character not in the first block, and so on
*
* the order based on the position of the rectangles can be obtained by
* progressively refining an order between rectangles:
*
* a. if two rectangles overlap horizontally, order them by their y1
* b. transitively close the resulting ordering
* c. if two rectangles are still incomparable, order them by x1
*
* rectanglelist_quicksort() skips step b. for the sake of code simplicity
*
* rectanglelist_twosort() implements the ordering in two steps: first, a
* variant of insertionsort that keeps into account the lack of transitive
* closure sorts the elements that overlap horizontally based on their y1;
* second, a variant of bubblesort sorts the elements according to their x1
* while not swapping elements that overlap horizontally
*/
/*
* the recurring blocks of text
*
* a document may contain text that is not part of its main content, like page
* numbers or more generally headers or footers
*
* RectangleList *rectanglevector_frequent(PopplerDocument *doc,
* gdouble height, gdouble distance);
* try to locate the recurring text in a page; the result is a set of
* rectangles such that every recurring text contains at least one of
* them; this is an heuristics function, so it is not guaranteed to work
* in all cases (see below); also, for large document pages are sampled at
* random, so the result may change every time
*
* PopplerRectangle *rectanglevector_main(PopplerDocument *doc,
* RectangleList *recur, gdouble height, gdouble distance);
* the largest rectangle in the page once the rectangles in recur have
* been removed; clipping to this rectangle is the first way to remove the
* recurring text from the page
*
* void rectanglelist_clip_containing(cairo_t *cr, PopplerPage *page,
* RectangleList *textarea, RectangleList *rm);
* clip out from the page all rectangles in the textarea that contains any
* of the rectangles in the rm list; this is the second way to remove the
* recurring text from the page
*
* recurring text is usually short in height and positioned the same in most
* pages, or in most even pages or most odd pages; these blocks are identified
* by searching for short areas of text present identical or similar in many
* pages
*
* page numbers may change size (e.g., from a single digit to two); headers and
* footers also may (e.g., the chapter title or author); for this reason,
* containment is allowed: if a short block of text in a page is contained in
* one of another page, it is counted as occurring in two pages; height is
* required to be the same; two blocks of identical size count much more than
* one contained in another
*
* the recurring blocks are found by a vector of rectangles sorted by rank; if
* a rectangle to insert contains some in the vector, their rank is increased
* by one; if it is contained in some it replaces them, again with increased
* rank; otherwise, the rectangle is inserted with rank one; at the end only
* the top rectangles are retained; this algorithm is not exact
*/
/*
* todo:
* images
* fix rectangle sorting by position
* rectangle sorting by characters
* page size (from cmdline or from original file)
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <libgen.h>
#include <poppler.h>
#include <cairo.h>
#include <cairo-pdf.h>
#include "pdfrects.h"
/*
* return the intermediate list of rectangles after the step of this number in
* the algorithm (see above); if not zero, print number of rectangles at each
* step; if -1, print the number of rectangles every time it changes
*/
int debugtextrectangles = 0;
/*
* print the frequent text areas during their construction or clip:
* 0x01 = yaml
* 0x02 = simple text
* 0x04 = rectangles being cut out
*/
int debugfrequent = 0x02 | 0x04;
/*
* print a rectangle
*/
void rectangle_print(FILE *fd, PopplerRectangle *r) {
if (r == NULL)
fprintf(fd, "[]");
else
fprintf(fd, "[%g,%g-%g,%g]", r->x1, r->y1, r->x2, r->y2);
}
void rectangle_printyaml(FILE *fd, char *first, char *indent,
PopplerRectangle *r) {
if (r == NULL) {
printf("%sNULL\n", indent);
return;
}
fprintf(fd, "%sx1: %g\n", first, r->x1);
fprintf(fd, "%sy1: %g\n", indent, r->y1);
fprintf(fd, "%sx2: %g\n", indent, r->x2);
fprintf(fd, "%sy2: %g\n", indent, r->y2);
}
/*
* parse a rectangle
*/
PopplerRectangle *rectangle_parse(char *s) {
gdouble x1, y1, x2, y2;
PopplerRectangle *res;
if (sscanf(s, "[%lg,%lg-%lg,%lg]", &x1, &y1, &x2, &y2) != 4)
return NULL;
res = poppler_rectangle_new();
res->x1 = x1;
res->y1 = y1;
res->x2 = x2;
res->y2 = y2;
return res;
}
/*
* normalize a rectangle: x1 <= x2 and y1 <= y2
*/
void rectangle_normalize(PopplerRectangle *rect) {
gdouble temp;
if (rect->x1 > rect->x2) {
temp = rect->x1;
rect->x1 = rect->x2;
rect->x2 = temp;
}
if (rect->y1 > rect->y2) {
temp = rect->y1;
rect->y1 = rect->y2;
rect->y2 = temp;
}
}
/*
* width and height of a rectangle
*/
double rectangle_width(PopplerRectangle *r) {
return r->x2 - r->x1;
}
double rectangle_height(PopplerRectangle *r) {
return r->y2 - r->y1;
}
/*
* area of a rectangle
*/
double rectangle_area(PopplerRectangle *r) {
return rectangle_width(r) * rectangle_height(r);
}
/*
* check if rectangle satisfies the bounds: both dimensions and at least one
*/
gboolean rectangle_bound(PopplerRectangle *r, RectangleBound *b) {
return r->x2 - r->x1 > b->both && r->y2 - r->y1 > b->both &&
(r->x2 - r->x1 > b->each || r->y2 - r->y1 > b->each);
}
/*
* floating number tolerance
*/
#define TOLERANCE 0.001
/*
* check if two rectangles are the same
*/
gboolean rectangle_hequal(PopplerRectangle *a, PopplerRectangle *b) {
return a->x1 == b->x1 && a->x2 == b->x2;
}
gboolean rectangle_vequal(PopplerRectangle *a, PopplerRectangle *b) {
return a->y1 == b->y1 && a->y2 == b->y2;
}
gboolean rectangle_equal(PopplerRectangle *a, PopplerRectangle *b) {
return rectangle_hequal(a, b) && rectangle_vequal(a, b);
}
/*
* check whether the first rectangle contains the second
*/
gboolean rectangle_hcontain(PopplerRectangle *a, PopplerRectangle *b) {
return a->x1 <= b->x1 + TOLERANCE && b->x2 - TOLERANCE <= a->x2;
}
gboolean rectangle_vcontain(PopplerRectangle *a, PopplerRectangle *b) {
return a->y1 <= b->y1 + TOLERANCE && b->y2 -TOLERANCE <= a->y2;
}
gboolean rectangle_contain(PopplerRectangle *a, PopplerRectangle *b) {
return rectangle_hcontain(a, b) && rectangle_vcontain(a, b);
}
/*
* check if rectangles overlap
*/
gboolean rectangle_hoverlap(PopplerRectangle *a, PopplerRectangle *b) {
return ! (a->x2 <= b->x1 || a->x1 >= b->x2);
}
gboolean rectangle_voverlap(PopplerRectangle *a, PopplerRectangle *b) {
return ! (a->y2 <= b->y1 || a->y1 >= b->y2);
}
gboolean rectangle_overlap(PopplerRectangle *a, PopplerRectangle *b) {
return rectangle_hoverlap(a, b) && rectangle_voverlap(a, b);
}
/*
* check if rectangles touch (meet or overlap)
*/
gboolean rectangle_htouch(PopplerRectangle *a, PopplerRectangle *b) {
return ! (a->x2 < b->x1 || a->x1 > b->x2);
}
gboolean rectangle_vtouch(PopplerRectangle *a, PopplerRectangle *b) {
return ! (a->y2 < b->y1 || a->y1 > b->y2);
}
gboolean rectangle_touch(PopplerRectangle *a, PopplerRectangle *b) {
return rectangle_htouch(a, b) && rectangle_vtouch(a, b);
}
/*
* distance between rectangles
*/
gdouble rectangle_hdistance(PopplerRectangle *a, PopplerRectangle *b) {
return MAX(MAX(b->x1 - a->x2, 0), MAX(a->x1 - b->x2, 0));
}
gdouble rectangle_vdistance(PopplerRectangle *a, PopplerRectangle *b) {
return MAX(MAX(b->y1 - a->y2, 0), MAX(a->y1 - b->y2, 0));
}
/*
* check whether a rectangle satisfies bounds and containment
*/
gboolean rectangle_boundcontain(PopplerRectangle *r,
PopplerRectangle *contained,
RectangleBound *bounds) {
if (bounds != NULL && ! rectangle_bound(r, bounds))
return FALSE;
if (contained != NULL && ! rectangle_contain(r, contained))
return FALSE;
return TRUE;
}
/*
* copy a rectangle onto another
*/
void rectangle_copy(PopplerRectangle *dest, PopplerRectangle *orig) {
memcpy(dest, orig, sizeof(PopplerRectangle));
}
/*
* swap two rectangles
*/
void rectangle_swap(PopplerRectangle *a, PopplerRectangle *b) {
PopplerRectangle temp;
temp = *a;
*a = *b;
*b = temp;
}
/*
* shift a rectangle
*/
void rectangle_shift(PopplerRectangle *r, gdouble x, gdouble y) {
r->x1 += x;
r->y1 += y;
r->x2 += x;
r->y2 += y;
}
/*
* expand a rectangle
*/
void rectangle_expand(PopplerRectangle *r, gdouble dx, gdouble dy) {
r->x1 -= dx;
r->y1 -= dy;
r->x2 += dx;
r->y2 += dy;
}
/*
* make the first rectangle the intersection of the other two
*/
void rectangle_intersect(PopplerRectangle *r,
PopplerRectangle *a, PopplerRectangle *b) {
r->x1 = MAX(a->x1, b->x1);
r->y1 = MAX(a->y1, b->y1);
r->x2 = MIN(a->x2, b->x2);
r->y2 = MIN(a->y2, b->y2);
}
/*
* join rectangles: the first becomes the smallest rectangle containing both
*/
void rectangle_join(PopplerRectangle *a, PopplerRectangle *b) {
if (b == NULL)
return;
a->x1 = MIN(a->x1, b->x1);
a->y1 = MIN(a->y1, b->y1);
a->x2 = MAX(a->x2, b->x2);
a->y2 = MAX(a->y2, b->y2);
}
/*
* compare the position of two rectangles
*/
int rectangle_hcompare(PopplerRectangle *a, PopplerRectangle *b) {
return a->x1 < b->x1 ? -1 : a->x1 == b->x1 ? 0 : 1;
}
int rectangle_vcompare(PopplerRectangle *a, PopplerRectangle *b) {
return a->y1 < b->y1 ? -1 : a->y1 == b->y1 ? 0 : 1;
}
int rectangle_compare(PopplerRectangle *a, PopplerRectangle *b) {
return rectangle_htouch(a, b) ?
rectangle_vcompare(a, b) :
rectangle_hcompare(a, b);
}
/*
* compare the area of two rectangles
*/
int rectangle_areacompare(PopplerRectangle *a, PopplerRectangle *b) {
gdouble aa, ab;
aa = rectangle_area(a);
ab = rectangle_area(b);
return aa < ab ? -1 : aa == ab ? 0 : 1;
}
/*
* a rectangle as large as the page
*/
void rectangle_page(PopplerPage *page, PopplerRectangle *rect) {
rect->x1 = 0;
rect->y1 = 0;
poppler_page_get_size(page, &rect->x2, &rect->y2);
}
/*
* allocate a rectangle list with maximum n rectangles and currently none
*/
RectangleList *rectanglelist_new(int n) {
RectangleList *res;
res = malloc(sizeof(RectangleList));
res->num = 0;
res->max = n;
res->rect = malloc(res->max * sizeof(PopplerRectangle));
return res;
}
/*
* make a copy of a rectangle list
*/
RectangleList *rectanglelist_copy(RectangleList *src) {
RectangleList *dst;
dst = rectanglelist_new(src->max);
dst->num = src->num;
memcpy(dst->rect, src->rect, dst->num * sizeof(PopplerRectangle));
return dst;
}
/*
* thighten a rectangle list by deallocating the unused entries
*/
void rectanglelist_tighten(RectangleList *r) {
r->max = r->num;
r->rect = realloc(r->rect, r->max * sizeof(PopplerRectangle));
}
/*
* free a rectangle list
*/
void rectanglelist_free(RectangleList *rl) {
if (rl == NULL)
return;
free(rl->rect);
free(rl);
}
/*
* print a rectangle list
*/
void rectanglelist_print(FILE *fd, RectangleList *rl) {
gint r;
for (r = 0; r < rl->num; r++) {
rectangle_print(fd, &rl->rect[r]);
fprintf(fd, "\n");
}
}
void rectanglelist_printyaml(FILE *fd, char *first, char *indent,
RectangleList *rl) {
gint r;
for (r = 0; r < rl->num; r++)
rectangle_printyaml(fd, first, indent, &rl->rect[r]);
}
/*
* remove a rectangle from a list
*/
void rectanglelist_delete(RectangleList *rl, gint n) {
if (n >= rl->num)
return;
rectangle_copy(rl->rect + n, rl->rect + --rl->num);
}
/*
* append a rectangle to a list
*/
void rectanglelist_append(RectangleList *rl, PopplerRectangle *rect) {
if (rl->num >= rl->max) {
rl->max += MAXRECT;
rl->rect =
realloc(rl->rect, rl->max * sizeof(PopplerRectangle));
}
rectangle_copy(rl->rect + rl->num++, rect);
}
/*
* add a rectangle to a list
*
* since a RectangleList represents the area that is the union of its
* rectangles, when adding a new rectangle some simplifications can be done:
*
* - the request is ignored if the rectangle is contained in one in the list
* - if the rectangle contains some rectangles in the list, it replaces them
*
* still, a rectangle list can be redundant: for example, a rectangle may be
* contained in the union of other two
*/
gboolean rectanglelist_add(RectangleList *rl, PopplerRectangle *rect) {
gint r;
gboolean placed;
placed = FALSE;
for (r = 0; r < rl->num; r++) {
if (rectangle_contain(rl->rect + r, rect))
return TRUE;
if (rectangle_contain(rect, rl->rect + r)) {
if (! placed) {
rectangle_copy(rl->rect + r, rect);
placed = TRUE;
}
else
rectanglelist_delete(rl, r);
}
}
if (! placed)
rectanglelist_append(rl, rect);
return TRUE;
}
/*
* smallest rectangle enclosing all in a rectangle list
*/
PopplerRectangle *rectanglelist_joinall(RectangleList *rl) {
PopplerRectangle *a;
int r;
if (rl == NULL || rl->num == 0)
return NULL;
a = poppler_rectangle_copy(&rl->rect[0]);
for (r = 1; r < rl->num; r++)
rectangle_join(a, &rl->rect[r]);
return a;
}
/*
* horizontal and vertical extents of a rectangle list
*/
RectangleList *rectanglelist_directionalextents(RectangleList *src,
int (*compare)(PopplerRectangle *a, PopplerRectangle *b),
gboolean (*touch)(PopplerRectangle *, PopplerRectangle *)) {
int i, j;
RectangleList *sorted, *dst;
sorted = rectanglelist_copy(src);
if (sorted->num == 0)
return sorted;
qsort(sorted->rect, sorted->num, sizeof(PopplerRectangle),
(int (*)(const void *, const void *)) compare);
dst = rectanglelist_new(MAXRECT);
for (i = 0; i < sorted->num; i++) {
j = dst->num - 1;
if (dst->num != 0 && touch(&dst->rect[j], &sorted->rect[i]))
rectangle_join(&dst->rect[j], &sorted->rect[i]);
else
rectanglelist_append(dst, &sorted->rect[i]);
}
rectanglelist_free(sorted);
return dst;
}
RectangleList *rectanglelist_hextents(RectangleList *src) {
return rectanglelist_directionalextents(src,
rectangle_hcompare, rectangle_htouch);
}
RectangleList *rectanglelist_vextents(RectangleList *src) {
return rectanglelist_directionalextents(src,
rectangle_vcompare, rectangle_vtouch);
}
/*
* total width and height of a rectangle list
*/
double rectanglelist_sum(RectangleList *rl,
double (*measure)(PopplerRectangle *)) {
double res = 0;
int i;
for (i = 0; i < rl->num; i++)
res += measure(&rl->rect[i]);
return res;
}
double rectanglelist_sumwidth(RectangleList *rl) {
return rectanglelist_sum(rl, rectangle_width);
}
double rectanglelist_sumheight(RectangleList *rl) {
return rectanglelist_sum(rl, rectangle_height);
}
/*
* average width and height of a rectangle list
*/
double rectanglelist_average(RectangleList *rl,
double (*measure)(PopplerRectangle *)) {
return rectanglelist_sum(rl, measure) / rl->num;
}
double rectanglelist_averagewidth(RectangleList *rl) {
return rectanglelist_average(rl, rectangle_width);
}
double rectanglelist_averageheight(RectangleList *rl) {
return rectanglelist_average(rl, rectangle_height);
}
/*
* index of rectangle in a list containing another rectangle
*/
gint rectanglelist_contain(RectangleList *rl, PopplerRectangle *r) {
gint index;
for (index = 0; index < rl->num; index++)
if (rectangle_contain(&rl->rect[index], r))
return index;
return -1;
}
/*
* index of rectangle in a list touching another rectangle
*/
gint rectanglelist_touch(RectangleList *rl, PopplerRectangle *r) {
gint index;
for (index = 0; index < rl->num; index++)
if (rectangle_touch(&rl->rect[index], r))
return index;
return -1;
}
/*
* index of rectangle in a list overlapping another rectangle
*/
gint rectanglelist_overlap(RectangleList *rl, PopplerRectangle *r) {
gint index;
for (index = 0; index < rl->num; index++)
if (rectangle_overlap(&rl->rect[index], r))
return index;
return -1;
}
/*
* sort a rectangle list by position, quick and approximate
*/
void rectanglelist_quicksort(RectangleList *rl, PopplerPage *page) {
(void) page;
qsort(rl->rect, rl->num, sizeof(PopplerRectangle),
(int (*)(const void *, const void *)) rectangle_compare);
}
/*
* sort a rectangle list by position, in two steps
*/
void rectanglelist_twosort(RectangleList *rl, PopplerPage *page) {
int i, j;
int sorted;
PopplerRectangle *r, *s;
(void) page;
/*
* sort vertically if horizontally overlapping
*
* iteratively find the minimum among i+1...rl->num-1
*
* when replacing the current minimum restart since the order is not
* transitively closed; e.g., {2, 0, 1} with 0<1 and 1<2 but not 0<2;
* without the restart the current minimum starts as 2, is not replaced
* by 0 but is then replaced by 1; minimum ends as 1 instead of 0
*/
for (i = 0; i < rl->num - 1; i++) {
r = &rl->rect[i];
for (j = i + 1; j < rl->num; j++) {
s = &rl->rect[j];
if (rectangle_htouch(r, s) &&
rectangle_vcompare(r, s) == 1) {
r = s;
j = i; // order is not transitivity closed
}
}
rectangle_swap(&rl->rect[i], r);
}
/*
* sort horizontally, but respect the previous ordering
*
* this step is correct for the same reason of regular bubblesort: the
* first scan of the list moves the element that should go last at the
* end of the list; the same for the other scans for the remaining part
* of the list
*
* let k be the rectangle that should be placed last in the list: no
* other rectangle is greater than it according to the order < of the
* first step and no incomparable rectangle has greater x1
*
* the previous step placed all rectangles lower than k according to <
* before k in the list; for example, if i<j<k then i is before j and j
* is before k; the second step never swaps horizontally-overlapping
* rectangles, so it does not move i ahead of j and j ahead of k; as a
* result, k cannot be overtaken by a rectangle that is directly (like
* j) or indirectly (like i) lower than it according to the order <; in
* the same way, a rectangle that is incomparable with k cannot
* overtake it since no incomparable rectangle has x1 larger than that
* of k by assumption; all of this proves that when the element
* preceding k is compared with k, they are not swapped
*
* as a result, the scan proceeds with k and not with its preceding
* element; since the elements following k are incomparable with k
* according to <, they all have lower x1; the scan therefore moves k
* to the last position
*/
for (i = 0, sorted = 0; i < rl->num && ! sorted; i++) {
sorted = 1;
for (j = 0; j < rl->num - 1; j++) {
r = &rl->rect[j];
s = &rl->rect[j + 1];
if (rectangle_htouch(r, s))
continue;
if (rectangle_hcompare(r, s) != 1)
continue;
rectangle_swap(r, s);
sorted = 0;
}
}
}
/*
* sort rectangles according to the order of their characters in the page
*/
void rectanglelist_charsort(RectangleList *rl, PopplerPage *page) {
PopplerRectangle *rect;
unsigned i, n;
int j, p;
poppler_page_get_text_layout(page, &rect, &n);
for (i = 0, p = 0; i < n && p < rl->num; i++)
for (j = p; j < rl->num; j++)
if (rectangle_contain(&rl->rect[j], &rect[i])) {
rectangle_swap(&rl->rect[p], &rl->rect[j]);
p++;
break;
}
g_free(rect);
}
/*
* largest rectangle in a list
*/
PopplerRectangle *rectanglelist_largest(RectangleList *rl) {
PopplerRectangle *max;
gdouble maxarea, area;
int r;
if (rl->num == 0)
return NULL;
max = &rl->rect[0];
maxarea = rectangle_area(max);
for (r = 1; r < rl->num; r++) {
area = rectangle_area(&rl->rect[r]);
if (maxarea < area) {
max = &rl->rect[r];
maxarea = area;
}
}
return max;
}
/*
* sort rectangles by area
*/
void rectanglelist_areasort(RectangleList *rl) {
qsort(rl->rect, rl->num, sizeof(PopplerRectangle),
(int (*)(const void *, const void *)) rectangle_areacompare);
}
/*
* position a rectangle in a page partially filled by others
*/
gboolean rectanglelist_place(PopplerRectangle *page,
RectangleList *rl, PopplerRectangle *r,
PopplerRectangle *moved) {
PopplerRectangle or;
gdouble x, y, minx, miny;
gint index;
rectangle_copy(&or, r);
rectangle_shift(&or, -or.x1, -or.y1);
for (y = page->y1; y + or.y2 <= page->y2; y = miny) {
miny = page->y2;
for (x = page->x1; x + or.x2 <= page->x2; x = minx) {
rectangle_copy(moved, &or);
rectangle_shift(moved, x, y);
index = rectanglelist_overlap(rl, moved);
if (index == -1)
return TRUE;
minx = rl->rect[index].x2;
miny = MIN(miny, rl->rect[index].y2);
}
}
return FALSE;
}
/*
* append the subtraction of rectangle sub from list orig to list res:
* res += orig - sub
*
* for each rectangle in orig, subtract sub from it; this may generate up to
* four rectangles, which are appended to list res if they contain rectangle
* cont (if not NULL) and satisfy bounds b (if not NULL)
*/
gboolean rectanglelist_subtract_append(RectangleList *dest,
RectangleList *orig, PopplerRectangle *sub,
PopplerRectangle *cont, RectangleBound *b) {
gint i;
PopplerRectangle *a, r;
for (i = 0; i < orig->num; i++) {
a = orig->rect + i;
r.x1 = a->x1;
r.y1 = a->y1;
r.x2 = MIN(a->x2, sub->x1);
r.y2 = a->y2;
if (rectangle_boundcontain(&r, cont, b))
if (! rectanglelist_add(dest, &r))
return FALSE;
r.x1 = a->x1;
r.y1 = a->y1;
r.x2 = a->x2;
r.y2 = MIN(a->y2, sub->y1);
if (rectangle_boundcontain(&r, cont, b))
if (! rectanglelist_add(dest, &r))
return FALSE;
r.x1 = MAX(a->x1, sub->x2);
r.y1 = a->y1;
r.x2 = a->x2;
r.y2 = a->y2;
if (rectangle_boundcontain(&r, cont, b))
if (! rectanglelist_add(dest, &r))
return FALSE;
r.x1 = a->x1;
r.y1 = MAX(a->y1, sub->y2);
r.x2 = a->x2;
r.y2 = a->y2;
if (rectangle_boundcontain(&r, cont, b))
if (! rectanglelist_add(dest, &r))
return FALSE;
}
return TRUE;
}
/*
* subtract a rectangle list from another: orig -= sub
*/
gboolean rectanglelist_subtract(RectangleList **orig, RectangleList *sub,
PopplerRectangle *cont, RectangleBound *b) {
RectangleList *dest;
gint r;
RectangleBound bd = {0.0, 0.0};
for (r = 0; r < sub->num; r++) {
dest = rectanglelist_new(MAXRECT);
if (! rectanglelist_subtract_append(dest, *orig, sub->rect + r,
cont, b != NULL ? b : &bd))
return FALSE;
if (debugtextrectangles == -1 && dest->num != (*orig)->num)
printf("rectangles: %d\n", dest->num);
rectanglelist_free(*orig);
*orig = dest;
}
return TRUE;
}
/*
* subtract a rectangle list from a single rectangle: res = r - rl
*/
RectangleList *rectanglelist_subtract1(PopplerRectangle *r, RectangleList *rl,
PopplerRectangle *cont, RectangleBound *b) {
RectangleList *res;
res = rectanglelist_new(MAXRECT);
rectangle_copy(res->rect, r);
res->num = 1;
if (! rectanglelist_subtract(&res, rl, cont, b)) {
rectanglelist_free(res);
return NULL;
}
return res;
}
/*
* join consecutive touching rectangles of a list
*/
void rectanglelist_consecutive(RectangleList *orig) {
gint i, j;
if (orig->num == 0)
return;
for (j = 0, i = 1; i < orig->num; i++)
if (rectangle_touch(orig->rect + j, orig->rect + i))
rectangle_join(orig->rect + j, orig->rect + i);
else {
j++;
rectangle_copy(orig->rect + j, orig->rect + i);
}
orig->num = j + 1;
}
/*
* join touching rectangles in a rectangle list
*/
void rectanglelist_join(RectangleList *orig) {
gint i, j, n;
/* why the do-while loop: joining may produce a rectangle that overlaps
* a previous one, for example:
* 1 2
* 3
* 654
* rectangle 1 does not touch any else; but then 2 is joined to 3, then
* 4, 5 and 6; the resulting rectangle includes 1, since joining two
* rectangles produces a rectangle that includes both */
do {
n = orig->num;
for (i = 0; i < orig->num; i++)
for (j = i + 1; j < orig->num; j++)
if (rectangle_touch(orig->rect + i,
orig->rect + j)) {
rectangle_join(orig->rect + i,
orig->rect + j);
rectanglelist_delete(orig, j);
j = i;
}
} while (n != orig->num);
}
/*
* the rectangles of the single characters in the page
*/
RectangleList *rectanglelist_characters(PopplerPage *page) {
RectangleList *layout;
char *text, *cur, *next;
guint n;
gint r;
layout = rectanglelist_new(0);
poppler_page_get_text_layout(page, &layout->rect, &n);
layout->num = n;