summaryrefslogtreecommitdiff
blob: 53002041f41dcc5bb74dc38dccb9ca8f4eb1d6bd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
/**********************************************************************
 * File:        pageres.cpp  (Formerly page_res.c)
 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
 *              and an iterator class to iterate over the words.
 * Main purposes:
 *              Easy way to iterate over the words without a 3-nested loop.
 *              Holds data used during word recognition.
 *              Holds information about alternative spacing paths.
 * Author:      Phil Cheatle
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "pageres.h"

#include "blamer.h"        // for BlamerBundle
#include "blobs.h"         // for TWERD, TBLOB
#include "boxword.h"       // for BoxWord
#include "errcode.h"       // for ASSERT_HOST
#include "ocrblock.h"      // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
#include "ocrrow.h"        // for ROW, ROW_IT
#include "pdblock.h"       // for PDBLK
#include "polyblk.h"       // for POLY_BLOCK
#include "seam.h"          // for SEAM, start_seam_list
#include "stepblob.h"      // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
#include "tprintf.h"       // for tprintf

#include <tesseract/publictypes.h>   // for OcrEngineMode, OEM_LSTM_ONLY

#include <cassert>         // for assert
#include <cstdint>         // for INT32_MAX
#include <cstring>         // for strlen

struct Pix;

namespace tesseract {

ELISTIZE (BLOCK_RES)
CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)

// Gain factor for computing thresholds that determine the ambiguity of a word.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant offset for computing thresholds that determine the ambiguity of a
// word.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Max number of broken pieces to associate.
const int kWordrecMaxNumJoinChunks = 4;
// Max ratio of word box height to line size to allow it to be processed as
// a line with other words.
const double kMaxWordSizeRatio = 1.25;
// Max ratio of line box height to line size to allow a new word to be added.
const double kMaxLineSizeRatio = 1.25;
// Max ratio of word gap to line size to allow a new word to be added.
const double kMaxWordGapRatio = 2.0;

// Computes and returns a threshold of certainty difference used to determine
// which words to keep, based on the adjustment factors of the two words.
// TODO(rays) This is horrible. Replace with an enhance params training model.
static double StopperAmbigThreshold(double f1, double f2) {
  return (f2 - f1) * kStopperAmbiguityThresholdGain -
      kStopperAmbiguityThresholdOffset;
}

/*************************************************************************
 * PAGE_RES::PAGE_RES
 *
 * Constructor for page results
 *************************************************************************/
PAGE_RES::PAGE_RES(
    bool merge_similar_words,
    BLOCK_LIST *the_block_list,
    WERD_CHOICE **prev_word_best_choice_ptr) {
  Init();
  BLOCK_IT block_it(the_block_list);
  BLOCK_RES_IT block_res_it(&block_res_list);
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
    block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
                                          block_it.data()));
  }
  prev_word_best_choice = prev_word_best_choice_ptr;
}

/*************************************************************************
 * BLOCK_RES::BLOCK_RES
 *
 * Constructor for BLOCK results
 *************************************************************************/

BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
  ROW_IT row_it (the_block->row_list ());
  ROW_RES_IT row_res_it(&row_res_list);

  char_count = 0;
  rej_count = 0;
  font_class = -1;               //not assigned
  x_height = -1.0;
  font_assigned = false;
  row_count = 0;

  block = the_block;

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
  }
}

/*************************************************************************
 * ROW_RES::ROW_RES
 *
 * Constructor for ROW results
 *************************************************************************/

ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
  WERD_IT word_it(the_row->word_list());
  WERD_RES_IT word_res_it(&word_res_list);
  WERD_RES *combo = nullptr;        // current combination of fuzzies
  WERD *copy_word;

  char_count = 0;
  rej_count = 0;
  whole_word_rej_count = 0;

  row = the_row;
  bool add_next_word = false;
  TBOX union_box;
  float line_height = the_row->x_height() + the_row->ascenders() -
      the_row->descenders();
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    auto* word_res = new WERD_RES(word_it.data());
    word_res->x_height = the_row->x_height();
    if (add_next_word) {
      ASSERT_HOST(combo != nullptr);
      // We are adding this word to the combination.
      word_res->part_of_combo = true;
      combo->copy_on(word_res);
    } else if (merge_similar_words) {
      union_box = word_res->word->bounding_box();
      add_next_word = !word_res->word->flag(W_REP_CHAR) &&
          union_box.height() <= line_height * kMaxWordSizeRatio;
      word_res->odd_size = !add_next_word;
    }
    WERD* next_word = word_it.data_relative(1);
    if (merge_similar_words) {
      if (add_next_word && !next_word->flag(W_REP_CHAR)) {
        // Next word will be added on if all of the following are true:
        // Not a rep char.
        // Box height small enough.
        // Union box height small enough.
        // Horizontal gap small enough.
        TBOX next_box = next_word->bounding_box();
        int prev_right = union_box.right();
        union_box += next_box;
        if (next_box.height() > line_height * kMaxWordSizeRatio ||
            union_box.height() > line_height * kMaxLineSizeRatio ||
            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
          add_next_word = false;
        }
      }
      next_word->set_flag(W_FUZZY_NON, add_next_word);
    } else {
      add_next_word = next_word->flag(W_FUZZY_NON);
    }
    if (add_next_word) {
      if (combo == nullptr) {
        copy_word = new WERD;
        *copy_word = *(word_it.data());  // deep copy
        combo = new WERD_RES(copy_word);
        combo->x_height = the_row->x_height();
        combo->combination = true;
        word_res_it.add_to_end(combo);
      }
      word_res->part_of_combo = true;
    } else {
      combo = nullptr;
    }
    word_res_it.add_to_end(word_res);
  }
}


WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
  this->ELIST_LINK::operator=(source);
  Clear();
  if (source.combination) {
    word = new WERD;
    *word = *(source.word);      // deep copy
  } else {
    word = source.word;          // pt to same word
  }
  if (source.bln_boxes != nullptr)
    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
  if (source.chopped_word != nullptr)
    chopped_word = new TWERD(*source.chopped_word);
  if (source.rebuild_word != nullptr)
    rebuild_word = new TWERD(*source.rebuild_word);
  // TODO(rays) Do we ever need to copy the seam_array?
  blob_row = source.blob_row;
  denorm = source.denorm;
  if (source.box_word != nullptr)
    box_word = new tesseract::BoxWord(*source.box_word);
  best_state = source.best_state;
  correct_text = source.correct_text;
  blob_widths = source.blob_widths;
  blob_gaps = source.blob_gaps;
  // None of the uses of operator= require the ratings matrix to be copied,
  // so don't as it would be really slow.

  // Copy the cooked choices.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
  WERD_CHOICE_IT wc_dest_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
    const WERD_CHOICE *choice = wc_it.data();
    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
  }
  if (!wc_dest_it.empty()) {
    wc_dest_it.move_to_first();
    best_choice = wc_dest_it.data();
  } else {
    best_choice = nullptr;
  }

  if (source.raw_choice != nullptr) {
    raw_choice = new WERD_CHOICE(*source.raw_choice);
  } else {
    raw_choice = nullptr;
  }
  if (source.ep_choice != nullptr) {
    ep_choice = new WERD_CHOICE(*source.ep_choice);
  } else {
    ep_choice = nullptr;
  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
  }
  return *this;
}

// Copies basic fields that don't involve pointers that might be useful
// to copy when making one WERD_RES from another.
void WERD_RES::CopySimpleFields(const WERD_RES& source) {
  tess_failed = source.tess_failed;
  tess_accepted = source.tess_accepted;
  tess_would_adapt = source.tess_would_adapt;
  done = source.done;
  unlv_crunch_mode = source.unlv_crunch_mode;
  small_caps = source.small_caps;
  odd_size = source.odd_size;
  fontinfo = source.fontinfo;
  fontinfo2 = source.fontinfo2;
  fontinfo_id_count = source.fontinfo_id_count;
  fontinfo_id2_count = source.fontinfo_id2_count;
  x_height = source.x_height;
  caps_height = source.caps_height;
  baseline_shift = source.baseline_shift;
  guessed_x_ht = source.guessed_x_ht;
  guessed_caps_ht = source.guessed_caps_ht;
  reject_spaces = source.reject_spaces;
  uch_set = source.uch_set;
  tesseract = source.tesseract;
}

// Initializes a blank (default constructed) WERD_RES from one that has
// already been recognized.
// Use SetupFor*Recognition afterwards to complete the setup and make
// it ready for a retry recognition.
void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
  word = source.word;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle = new BlamerBundle();
    blamer_bundle->CopyTruth(*source.blamer_bundle);
  }
}

// Sets up the members used in recognition: bln_boxes, chopped_word,
// seam_array, denorm.  Returns false if
// the word is empty and sets up fake results.  If use_body_size is
// true and row->body_size is set, then body_size will be used for
// blob normalization instead of xheight + ascrise. This flag is for
// those languages that are using CJK pitch model and thus it has to
// be true if and only if tesseract->textord_use_cjk_fp_model is
// true.
// If allow_detailed_fx is true, the feature extractor will receive fine
// precision outline information, allowing smoother features and better
// features on low resolution images.
// The norm_mode_hint sets the default mode for normalization in absence
// of any of the above flags.
// norm_box is used to override the word bounding box to determine the
// normalization scale and offset.
// Returns false if the word is empty and sets up fake results.
bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
                                   tesseract::Tesseract* tess, Pix* pix,
                                   int norm_mode,
                                   const TBOX* norm_box,
                                   bool numeric_mode,
                                   bool use_body_size,
                                   bool allow_detailed_fx,
                                   ROW *row, const BLOCK* block) {
  auto norm_mode_hint =
      static_cast<tesseract::OcrEngineMode>(norm_mode);
  tesseract = tess;
  POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
       word->cblob_list()->empty()) ||
      (pb != nullptr && !pb->IsText())) {
    // Empty words occur when all the blobs have been moved to the rej_blobs
    // list, which seems to occur frequently in junk.
    SetupFake(unicharset_in);
    word->set_flag(W_REP_CHAR, false);
    return false;
  }
  ClearResults();
  SetupWordScript(unicharset_in);
  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
  float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
                     ? row->body_size() : x_height;
  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
                            word_xheight, baseline_shift, numeric_mode,
                            norm_mode_hint, norm_box, &denorm);
  blob_row = row;
  SetupBasicsFromChoppedWord(unicharset_in);
  SetupBlamerBundle();
  int num_blobs = chopped_word->NumBlobs();
  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
  tess_failed = false;
  return true;
}

// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
// accumulators from a made chopped word.  We presume the fields are already
// empty.
void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
  start_seam_list(chopped_word, &seam_array);
  SetupBlobWidthsAndGaps();
  ClearWordChoices();
}

// Sets up the members used in recognition for an empty recognition result:
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
  ClearResults();
  SetupWordScript(unicharset_in);
  chopped_word = new TWERD;
  rebuild_word = new TWERD;
  bln_boxes = new tesseract::BoxWord;
  box_word = new tesseract::BoxWord;
  int blob_count = word->cblob_list()->length();
  if (blob_count > 0) {
    auto** fake_choices = new BLOB_CHOICE*[blob_count];
    // For non-text blocks, just pass any blobs through to the box_word
    // and call the word failed with a fake classification.
    C_BLOB_IT b_it(word->cblob_list());
    int blob_id = 0;
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
      TBOX box = b_it.data()->bounding_box();
      box_word->InsertBox(box_word->length(), box);
      fake_choices[blob_id++] = new BLOB_CHOICE;
    }
    FakeClassifyWord(blob_count, fake_choices);
    delete [] fake_choices;
  } else {
    auto* word = new WERD_CHOICE(&unicharset_in);
    word->make_bad();
    LogNewRawChoice(word);
    // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
    LogNewCookedChoice(1, false, word);
  }
  tess_failed = true;
  done = true;
}

void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
  uch_set = &uch;
  int script = uch.default_sid();
  word->set_script_id(script);
  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
}

// Sets up the blamer_bundle if it is not null, using the initialized denorm.
void WERD_RES::SetupBlamerBundle() {
  if (blamer_bundle != nullptr) {
    blamer_bundle->SetupNormTruthWord(denorm);
  }
}

// Computes the blob_widths and blob_gaps from the chopped_word.
void WERD_RES::SetupBlobWidthsAndGaps() {
  blob_widths.truncate(0);
  blob_gaps.truncate(0);
  int num_blobs = chopped_word->NumBlobs();
  for (int b = 0; b < num_blobs; ++b) {
    TBLOB *blob = chopped_word->blobs[b];
    TBOX box = blob->bounding_box();
    blob_widths.push_back(box.width());
    if (b + 1 < num_blobs) {
      blob_gaps.push_back(
          chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
    }
  }
}

// Updates internal data to account for a new SEAM (chop) at the given
// blob_number. Fixes the ratings matrix and states in the choices, as well
// as the blob widths and gaps.
void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
  // Insert the seam into the SEAMS array.
  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
  seam_array.insert(seam, blob_number);
  if (ratings != nullptr) {
    // Expand the ratings matrix.
    ratings = ratings->ConsumeAndMakeBigger(blob_number);
    // Fix all the segmentation states.
    if (raw_choice != nullptr)
      raw_choice->UpdateStateForSplit(blob_number);
    WERD_CHOICE_IT wc_it(&best_choices);
    for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
      WERD_CHOICE* choice = wc_it.data();
      choice->UpdateStateForSplit(blob_number);
    }
    SetupBlobWidthsAndGaps();
  }
}

// Returns true if all the word choices except the first have adjust_factors
// worse than the given threshold.
bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
  // The choices are not changed by this iteration.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
    WERD_CHOICE* choice = wc_it.data();
    if (choice->adjust_factor() <= threshold)
      return false;
  }
  return true;
}

// Returns true if the current word is ambiguous (by number of answers or
// by dangerous ambigs.)
bool WERD_RES::IsAmbiguous() {
  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
}

// Returns true if the ratings matrix size matches the sum of each of the
// segmentation states.
bool WERD_RES::StatesAllValid() {
  int ratings_dim = ratings->dimension();
  if (raw_choice->TotalOfStates() != ratings_dim) {
    tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
            raw_choice->TotalOfStates(), ratings_dim);
    return false;
  }
  WERD_CHOICE_IT it(&best_choices);
  int index = 0;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
    WERD_CHOICE* choice = it.data();
    if (choice->TotalOfStates() != ratings_dim) {
      tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
              index, choice->TotalOfStates(), ratings_dim);
      return false;
    }
  }
  return true;
}

// Prints a list of words found if debug is true or the word result matches
// the word_to_debug.
void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
  if (debug ||
      (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
       best_choice->unichar_string() == STRING(word_to_debug))) {
    if (raw_choice != nullptr)
      raw_choice->print("\nBest Raw Choice");

    WERD_CHOICE_IT it(&best_choices);
    int index = 0;
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
      WERD_CHOICE* choice = it.data();
      STRING label;
      label.add_str_int("\nCooked Choice #", index);
      choice->print(label.c_str());
    }
  }
}

// Prints the top choice along with the accepted/done flags.
void WERD_RES::DebugTopChoice(const char* msg) const {
  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
          tess_accepted, tess_would_adapt, done);
  if (best_choice == nullptr)
    tprintf("<Null choice>\n");
  else
    best_choice->print(msg);
}

// Removes from best_choices all choices which are not within a reasonable
// range of the best choice.
// TODO(rays) incorporate the information used here into the params training
// re-ranker, in place of this heuristic that is based on the previous
// adjustment factor.
void WERD_RES::FilterWordChoices(int debug_level) {
  if (best_choice == nullptr || best_choices.singleton())
    return;

  if (debug_level >= 2)
    best_choice->print("\nFiltering against best choice");
  WERD_CHOICE_IT it(&best_choices);
  int index = 0;
  for (it.forward(); !it.at_first(); it.forward(), ++index) {
    WERD_CHOICE* choice = it.data();
    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
                                            choice->adjust_factor());
    // i, j index the blob choice in choice, best_choice.
    // chunk is an index into the chopped_word blobs (AKA chunks).
    // Since the two words may use different segmentations of the chunks, we
    // iterate over the chunks to find out whether a comparable blob
    // classification is much worse than the best result.
    int i = 0, j = 0, chunk = 0;
    // Each iteration of the while deals with 1 chunk. On entry choice_chunk
    // and best_chunk are the indices of the first chunk in the NEXT blob,
    // i.e. we don't have to increment i, j while chunk < choice_chunk and
    // best_chunk respectively.
    int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
    while (i < choice->length() && j < best_choice->length()) {
      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
          choice->certainty(i) - best_choice->certainty(j) < threshold) {
        if (debug_level >= 2) {
          choice->print("WorstCertaintyDiffWorseThan");
          tprintf(
              "i %d j %d Choice->Blob[i].Certainty %.4g"
              " WorstOtherChoiceCertainty %g Threshold %g\n",
              i, j, choice->certainty(i), best_choice->certainty(j), threshold);
          tprintf("Discarding bad choice #%d\n", index);
        }
        delete it.extract();
        break;
      }
      ++chunk;
      // If needed, advance choice_chunk to keep up with chunk.
      while (choice_chunk < chunk && ++i < choice->length())
        choice_chunk += choice->state(i);
      // If needed, advance best_chunk to keep up with chunk.
      while (best_chunk < chunk && ++j < best_choice->length())
        best_chunk += best_choice->state(j);
    }
  }
}

void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
                                         float min_rating,
                                         float max_rating,
                                         float rating_margin,
                                         float* thresholds) {
  int chunk = 0;
  int end_chunk = best_choice->state(0);
  int end_raw_chunk = raw_choice->state(0);
  int raw_blob = 0;
  for (int i = 0; i < best_choice->length(); i++, thresholds++) {
    float avg_rating = 0.0f;
    int num_error_chunks = 0;

    // For each chunk in best choice blob i, count non-matching raw results.
    while (chunk < end_chunk) {
      if (chunk >= end_raw_chunk) {
        ++raw_blob;
        end_raw_chunk += raw_choice->state(raw_blob);
      }
      if (best_choice->unichar_id(i) !=
          raw_choice->unichar_id(raw_blob)) {
        avg_rating += raw_choice->certainty(raw_blob);
        ++num_error_chunks;
      }
      ++chunk;
    }

    if (num_error_chunks > 0) {
      avg_rating /= num_error_chunks;
      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
    } else {
      *thresholds = max_rating;
    }

    if (*thresholds > max_rating)
      *thresholds = max_rating;
    if (*thresholds < min_rating)
      *thresholds = min_rating;
  }
}

// Saves a copy of the word_choice if it has the best unadjusted rating.
// Returns true if the word_choice was the new best.
bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) {
  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
    delete raw_choice;
    raw_choice = new WERD_CHOICE(*word_choice);
    raw_choice->set_permuter(TOP_CHOICE_PERM);
    return true;
  }
  return false;
}

// Consumes word_choice by adding it to best_choices, (taking ownership) if
// the certainty for word_choice is some distance of the best choice in
// best_choices, or by deleting the word_choice and returning false.
// The best_choices list is kept in sorted order by rating. Duplicates are
// removed, and the list is kept no longer than max_num_choices in length.
// Returns true if the word_choice is still a valid pointer.
bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
                                  WERD_CHOICE* word_choice) {
  if (best_choice != nullptr) {
    // Throw out obviously bad choices to save some work.
    // TODO(rays) Get rid of this! This piece of code produces different
    // results according to the order in which words are found, which is an
    // undesirable behavior. It would be better to keep all the choices and
    // prune them later when more information is available.
    float max_certainty_delta =
        StopperAmbigThreshold(best_choice->adjust_factor(),
                              word_choice->adjust_factor());
    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
      max_certainty_delta = -kStopperAmbiguityThresholdOffset;
    if (word_choice->certainty() - best_choice->certainty() <
        max_certainty_delta) {
      if (debug) {
        STRING bad_string;
        word_choice->string_and_lengths(&bad_string, nullptr);
        tprintf("Discarding choice \"%s\" with an overly low certainty"
                " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
                bad_string.c_str(), word_choice->certainty(),
                best_choice->certainty(),
                max_certainty_delta + best_choice->certainty());
      }
      delete word_choice;
      return false;
    }
  }

  // Insert in the list in order of increasing rating, but knock out worse
  // string duplicates.
  WERD_CHOICE_IT it(&best_choices);
  const STRING& new_str = word_choice->unichar_string();
  bool inserted = false;
  int num_choices = 0;
  if (!it.empty()) {
    do {
      WERD_CHOICE* choice = it.data();
      if (choice->rating() > word_choice->rating() && !inserted) {
        // Time to insert.
        it.add_before_stay_put(word_choice);
        inserted = true;
        if (num_choices == 0)
          best_choice = word_choice;  // This is the new best.
        ++num_choices;
      }
      if (choice->unichar_string() == new_str) {
        if (inserted) {
          // New is better.
          delete it.extract();
        } else {
          // Old is better.
          if (debug) {
            tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
                    new_str.c_str(), word_choice->rating(), choice->rating());
          }
          delete word_choice;
          return false;
        }
      } else {
        ++num_choices;
        if (num_choices > max_num_choices)
          delete it.extract();
      }
      it.forward();
    } while (!it.at_first());
  }
  if (!inserted && num_choices < max_num_choices) {
    it.add_to_end(word_choice);
    inserted = true;
    if (num_choices == 0)
      best_choice = word_choice;  // This is the new best.
  }
  if (debug) {
    if (inserted)
      tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
    else
      tprintf("Poor");
    word_choice->print(" Word Choice");
  }
  if (!inserted) {
    delete word_choice;
    return false;
  }
  return true;
}


// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
template<class T> static void MovePointerData(T** dest, T**src) {
  delete *dest;
  *dest = *src;
  *src = nullptr;
}

// Prints a brief list of all the best choices.
void WERD_RES::PrintBestChoices() const {
  STRING alternates_str;
  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    if (!it.at_first()) alternates_str += "\", \"";
    alternates_str += it.data()->unichar_string();
  }
  tprintf("Alternates for \"%s\": {\"%s\"}\n",
          best_choice->unichar_string().c_str(), alternates_str.c_str());
}

// Returns the sum of the widths of the blob between start_blob and last_blob
// inclusive.
int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
  int result = 0;
  for (int b = start_blob; b <= last_blob; ++b) {
    result += blob_widths[b];
    if (b < last_blob)
      result += blob_gaps[b];
  }
  return result;
}
// Returns the width of a gap between the specified blob and the next one.
int WERD_RES::GetBlobsGap(int blob_index) {
  if (blob_index < 0 || blob_index >= blob_gaps.size())
    return 0;
  return blob_gaps[blob_index];
}

// Returns the BLOB_CHOICE corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete. May return nullptr if there is no
// BLOB_CHOICE matching the unichar_id at the given index.
BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const {
  if (index < 0 || index >= best_choice->length()) return nullptr;
  BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
  return FindMatchingChoice(best_choice->unichar_id(index), choices);
}

// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
  return best_choice->blob_choices(index, ratings);
}

// Moves the results fields from word to this. This takes ownership of all
// the data, so src can be destructed.
void WERD_RES::ConsumeWordResults(WERD_RES* word) {
  denorm = word->denorm;
  blob_row = word->blob_row;
  MovePointerData(&chopped_word, &word->chopped_word);
  MovePointerData(&rebuild_word, &word->rebuild_word);
  MovePointerData(&box_word, &word->box_word);
  seam_array.delete_data_pointers();
  seam_array = word->seam_array;
  word->seam_array.clear();
  best_state.move(&word->best_state);
  correct_text.move(&word->correct_text);
  blob_widths.move(&word->blob_widths);
  blob_gaps.move(&word->blob_gaps);
  if (ratings != nullptr) ratings->delete_matrix_pointers();
  MovePointerData(&ratings, &word->ratings);
  best_choice = word->best_choice;
  MovePointerData(&raw_choice, &word->raw_choice);
  best_choices.clear();
  WERD_CHOICE_IT wc_it(&best_choices);
  wc_it.add_list_after(&word->best_choices);
  reject_map = word->reject_map;
  if (word->blamer_bundle != nullptr) {
    assert(blamer_bundle != nullptr);
    blamer_bundle->CopyResults(*(word->blamer_bundle));
  }
  CopySimpleFields(*word);
}

// Replace the best choice and rebuild box word.
// choice must be from the current best_choices list.
void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) {
  best_choice = choice;
  RebuildBestState();
  SetupBoxWord();
  // Make up a fake reject map of the right length to keep the
  // rejection pass happy.
  reject_map.initialise(best_state.size());
  done = tess_accepted = tess_would_adapt = true;
  SetScriptPositions();
}

// Builds the rebuild_word and sets the best_state from the chopped_word and
// the best_choice->state.
void WERD_RES::RebuildBestState() {
  ASSERT_HOST(best_choice != nullptr);
  delete rebuild_word;
  rebuild_word = new TWERD;
  if (seam_array.empty())
    start_seam_list(chopped_word, &seam_array);
  best_state.truncate(0);
  int start = 0;
  for (int i = 0; i < best_choice->length(); ++i) {
    int length = best_choice->state(i);
    best_state.push_back(length);
    if (length > 1) {
      SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
                       start + length - 1);
    }
    TBLOB* blob = chopped_word->blobs[start];
    rebuild_word->blobs.push_back(new TBLOB(*blob));
    if (length > 1) {
      SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
                        start + length - 1);
    }
    start += length;
  }
}

// Copies the chopped_word to the rebuild_word, faking a best_state as well.
// Also sets up the output box_word.
void WERD_RES::CloneChoppedToRebuild() {
  delete rebuild_word;
  rebuild_word = new TWERD(*chopped_word);
  SetupBoxWord();
  int word_len = box_word->length();
  best_state.reserve(word_len);
  correct_text.reserve(word_len);
  for (int i = 0; i < word_len; ++i) {
    best_state.push_back(1);
    correct_text.push_back(STRING(""));
  }
}

// Sets/replaces the box_word with one made from the rebuild_word.
void WERD_RES::SetupBoxWord() {
  delete box_word;
  rebuild_word->ComputeBoundingBoxes();
  box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
  box_word->ClipToOriginalWord(denorm.block(), word);
}

// Sets up the script positions in the output best_choice using the best_choice
// to get the unichars, and the unicharset to get the target positions.
void WERD_RES::SetScriptPositions() {
  best_choice->SetScriptPositions(small_caps, chopped_word);
}
// Sets all the blobs in all the words (raw choice and best choices) to be
// the given position. (When a sub/superscript is recognized as a separate
// word, it falls victim to the rule that a whole word cannot be sub or
// superscript, so this function overrides that problem.)
void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
  raw_choice->SetAllScriptPositions(position);
  WERD_CHOICE_IT wc_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
    wc_it.data()->SetAllScriptPositions(position);
}

// Classifies the word with some already-calculated BLOB_CHOICEs.
// The choices are an array of blob_count pointers to BLOB_CHOICE,
// providing a single classifier result for each blob.
// The BLOB_CHOICEs are consumed and the word takes ownership.
// The number of blobs in the box_word must match blob_count.
void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
  // Setup the WERD_RES.
  ASSERT_HOST(box_word != nullptr);
  ASSERT_HOST(blob_count == box_word->length());
  ClearWordChoices();
  ClearRatings();
  ratings = new MATRIX(blob_count, 1);
  for (int c = 0; c < blob_count; ++c) {
    auto* choice_list = new BLOB_CHOICE_LIST;
    BLOB_CHOICE_IT choice_it(choice_list);
    choice_it.add_after_then_move(choices[c]);
    ratings->put(c, c, choice_list);
  }
  FakeWordFromRatings(TOP_CHOICE_PERM);
  reject_map.initialise(blob_count);
  best_state.init_to_size(blob_count, 1);
  done = true;
}

// Creates a WERD_CHOICE for the word using the top choices from the leading
// diagonal of the ratings matrix.
void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
  int num_blobs = ratings->dimension();
  auto* word_choice = new WERD_CHOICE(uch_set, num_blobs);
  word_choice->set_permuter(permuter);
  for (int b = 0; b < num_blobs; ++b) {
    UNICHAR_ID unichar_id = UNICHAR_SPACE;
    // Initialize rating and certainty like in WERD_CHOICE::make_bad().
    float rating = WERD_CHOICE::kBadRating;
    float certainty = -FLT_MAX;
    BLOB_CHOICE_LIST* choices = ratings->get(b, b);
    if (choices != nullptr && !choices->empty()) {
      BLOB_CHOICE_IT bc_it(choices);
      BLOB_CHOICE* choice = bc_it.data();
      unichar_id = choice->unichar_id();
      rating = choice->rating();
      certainty = choice->certainty();
    }
    word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
                                                   certainty);
  }
  LogNewRawChoice(word_choice);
  // Ownership of word_choice taken by word here.
  LogNewCookedChoice(1, false, word_choice);
}

// Copies the best_choice strings to the correct_text for adaption/training.
void WERD_RES::BestChoiceToCorrectText() {
  correct_text.clear();
  ASSERT_HOST(best_choice != nullptr);
  for (int i = 0; i < best_choice->length(); ++i) {
    UNICHAR_ID choice_id = best_choice->unichar_id(i);
    const char* blob_choice = uch_set->id_to_unichar(choice_id);
    correct_text.push_back(STRING(blob_choice));
  }
}

// Merges 2 adjacent blobs in the result if the permanent callback
// class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
// callback box_cb is nullptr or returns true, setting the merged blob
// result to the class returned from class_cb.
// Returns true if anything was merged.
bool WERD_RES::ConditionalBlobMerge(
    std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> class_cb,
    std::function<bool(const TBOX&, const TBOX&)> box_cb) {
  ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
  bool modified = false;
  for (int i = 0; i + 1 < best_choice->length(); ++i) {
    UNICHAR_ID new_id = class_cb(best_choice->unichar_id(i),
                                 best_choice->unichar_id(i+1));
    if (new_id != INVALID_UNICHAR_ID &&
        (box_cb == nullptr || box_cb(box_word->BlobBox(i),
                                     box_word->BlobBox(i + 1)))) {
      // Raw choice should not be fixed.
      best_choice->set_unichar_id(new_id, i);
      modified = true;
      MergeAdjacentBlobs(i);
      const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
      if (!coord.Valid(*ratings)) {
        ratings->IncreaseBandSize(coord.row + 1 - coord.col);
      }
      BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
      if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
        // Insert a fake result.
        auto* blob_choice = new BLOB_CHOICE;
        blob_choice->set_unichar_id(new_id);
        BLOB_CHOICE_IT bc_it(blob_choices);
        bc_it.add_before_then_move(blob_choice);
      }
    }
  }
  return modified;
}

// Merges 2 adjacent blobs in the result (index and index+1) and corrects
// all the data to account for the change.
void WERD_RES::MergeAdjacentBlobs(int index) {
  if (reject_map.length() == best_choice->length())
    reject_map.remove_pos(index);
  best_choice->remove_unichar_id(index + 1);
  rebuild_word->MergeBlobs(index, index + 2);
  box_word->MergeBoxes(index, index + 2);
  if (index + 1 < best_state.size()) {
    best_state[index] += best_state[index + 1];
    best_state.remove(index + 1);
  }
}

// TODO(tkielbus) Decide between keeping this behavior here or modifying the
// training data.

// Utility function for fix_quotes
// Return true if the next character in the string (given the UTF8 length in
// bytes) is a quote character.
static int is_simple_quote(const char* signed_str, int length) {
  const auto* str =
      reinterpret_cast<const unsigned char*>(signed_str);
  // Standard 1 byte quotes.
  return (length == 1 && (*str == '\'' || *str == '`')) ||
      // UTF-8 3 bytes curved quotes.
      (length == 3 && ((*str == 0xe2 &&
                        *(str + 1) == 0x80 &&
                        *(str + 2) == 0x98) ||
                       (*str == 0xe2 &&
                        *(str + 1) == 0x80 &&
                        *(str + 2) == 0x99)));
}

// Callback helper for fix_quotes returns a double quote if both
// arguments are quote, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
  const char *ch = uch_set->id_to_unichar(id1);
  const char *next_ch = uch_set->id_to_unichar(id2);
  if (is_simple_quote(ch, strlen(ch)) &&
      is_simple_quote(next_ch, strlen(next_ch)))
    return uch_set->unichar_to_id("\"");
  return INVALID_UNICHAR_ID;
}

// Change pairs of quotes to double quotes.
void WERD_RES::fix_quotes() {
  if (!uch_set->contains_unichar("\"") ||
      !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
    return;  // Don't create it if it is disallowed.

  using namespace std::placeholders;  // for _1, _2
  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2),
                       nullptr);
}

// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
  const char *ch = uch_set->id_to_unichar(id1);
  const char *next_ch = uch_set->id_to_unichar(id2);
  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
      (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
    return uch_set->unichar_to_id("-");
  return INVALID_UNICHAR_ID;
}

// Callback helper for fix_hyphens returns true if box1 and box2 overlap
// (assuming both on the same textline, are in order and a chopped em dash.)
bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
  return box1.right() >= box2.left();
}

// Change pairs of hyphens to a single hyphen if the bounding boxes touch
// Typically a long dash which has been segmented.
void WERD_RES::fix_hyphens() {
  if (!uch_set->contains_unichar("-") ||
      !uch_set->get_enabled(uch_set->unichar_to_id("-")))
    return;  // Don't create it if it is disallowed.

  using namespace std::placeholders;  // for _1, _2
  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
                       std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
}

// Callback helper for merge_tess_fails returns a space if both
// arguments are space, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
    return id1;
  else
    return INVALID_UNICHAR_ID;
}

// Change pairs of tess failures to a single one
void WERD_RES::merge_tess_fails() {
  using namespace std::placeholders;  // for _1, _2
  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces,
                                     this, _1, _2), nullptr)) {
    int len = best_choice->length();
    ASSERT_HOST(reject_map.length() == len);
    ASSERT_HOST(box_word->length() == len);
  }
}

// Returns true if the collection of count pieces, starting at start, are all
// natural connected components, ie there are no real chops involved.
bool WERD_RES::PiecesAllNatural(int start, int count) const {
  // all seams must have no splits.
  for (int index = start; index < start + count - 1; ++index) {
    if (index >= 0 && index < seam_array.size()) {
      SEAM* seam = seam_array[index];
      if (seam != nullptr && seam->HasAnySplits()) return false;
    }
  }
  return true;
}


WERD_RES::~WERD_RES () {
  Clear();
}

void WERD_RES::Clear() {
  if (combination) {
    delete word;
  }
  word = nullptr;
  delete blamer_bundle;
  blamer_bundle = nullptr;
  ClearResults();
}

void WERD_RES::ClearResults() {
  done = false;
  fontinfo = nullptr;
  fontinfo2 = nullptr;
  fontinfo_id_count = 0;
  fontinfo_id2_count = 0;
  delete bln_boxes;
  bln_boxes = nullptr;
  blob_row = nullptr;
  delete chopped_word;
  chopped_word = nullptr;
  delete rebuild_word;
  rebuild_word = nullptr;
  delete box_word;
  box_word = nullptr;
  best_state.clear();
  correct_text.clear();
  seam_array.delete_data_pointers();
  seam_array.clear();
  blob_widths.clear();
  blob_gaps.clear();
  ClearRatings();
  ClearWordChoices();
  if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
}
void WERD_RES::ClearWordChoices() {
  best_choice = nullptr;
  delete raw_choice;
  raw_choice = nullptr;
  best_choices.clear();
  delete ep_choice;
  ep_choice = nullptr;
}
void WERD_RES::ClearRatings() {
  if (ratings != nullptr) {
    ratings->delete_matrix_pointers();
    delete ratings;
    ratings = nullptr;
  }
}

int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
  ASSERT_HOST(page_res == other.page_res);
  if (other.block_res == nullptr) {
    // other points to the end of the page.
    if (block_res == nullptr)
      return 0;
    return -1;
  }
  if (block_res == nullptr) {
    return 1; // we point to the end of the page.
  }
  if (block_res == other.block_res) {
    if (other.row_res == nullptr || row_res == nullptr) {
      // this should only happen if we hit an image block.
      return 0;
    }
    if (row_res == other.row_res) {
      // we point to the same block and row.
      ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
      if (word_res == other.word_res) {
        // we point to the same word!
        return 0;
      }

      WERD_RES_IT word_res_it(&row_res->word_res_list);
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (word_res_it.data() == word_res) {
          return -1;
        } else if (word_res_it.data() == other.word_res) {
          return 1;
        }
      }
      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
    }

    // we both point to the same block, but different rows.
    ROW_RES_IT row_res_it(&block_res->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      if (row_res_it.data() == row_res) {
        return -1;
      } else if (row_res_it.data() == other.row_res) {
        return 1;
      }
    }
    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  }

  // We point to different blocks.
  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
  for (block_res_it.mark_cycle_pt();
       !block_res_it.cycled_list(); block_res_it.forward()) {
    if (block_res_it.data() == block_res) {
      return -1;
    } else if (block_res_it.data() == other.block_res) {
      return 1;
    }
  }
  // Shouldn't happen...
  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  return 0;
}

// Inserts the new_word as a combination owned by a corresponding WERD_RES
// before the current position. The simple fields of the WERD_RES are copied
// from clone_res and the resulting WERD_RES is returned for further setup
// with best_choice etc.
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
                                             WERD* new_word) {
  // Make a WERD_RES for the new_word.
  auto* new_res = new WERD_RES(new_word);
  new_res->CopySimpleFields(clone_res);
  new_res->combination = true;
  // Insert into the appropriate place in the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    WERD_RES* word = wr_it.data();
    if (word == word_res)
      break;
  }
  ASSERT_HOST(!wr_it.cycled_list());
  wr_it.add_before_then_move(new_res);
  if (wr_it.at_first()) {
    // This is the new first word, so reset the member iterator so it
    // detects the cycled_list state correctly.
    ResetWordIterator();
  }
  return new_res;
}

// Helper computes the boundaries between blobs in the word. The blob bounds
// are likely very poor, if they come from LSTM, where it only outputs the
// character at one pixel within it, so we find the midpoints between them.
static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
                            C_BLOB_LIST* next_word_blobs,
                            GenericVector<int>* blob_ends) {
  C_BLOB_IT blob_it(word.word->cblob_list());
  for (int i = 0; i < word.best_state.size(); ++i) {
    int length = word.best_state[i];
    // Get the bounding box of the fake blobs
    TBOX blob_box = blob_it.data()->bounding_box();
    blob_it.forward();
    for (int b = 1; b < length; ++b) {
      blob_box += blob_it.data()->bounding_box();
      blob_it.forward();
    }
    // This blob_box is crap, so for now we are only looking for the
    // boundaries between them.
    int blob_end = INT32_MAX;
    if (!blob_it.at_first() || next_word_blobs != nullptr) {
      if (blob_it.at_first())
        blob_it.set_to_list(next_word_blobs);
      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
    }
    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
    blob_ends->push_back(blob_end);
  }
  blob_ends->back() = clip_box.right();
}

// Helper computes the bounds of a word by restricting it to existing words
// that significantly overlap.
static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES>& words,
                              int w_index, TBOX prev_box, WERD_RES_IT w_it) {
  constexpr int kSignificantOverlapFraction = 4;
  TBOX clipped_box;
  TBOX current_box = words[w_index]->word->bounding_box();
  TBOX next_box;
  if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
      words[w_index + 1]->word != nullptr)
    next_box = words[w_index + 1]->word->bounding_box();
  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
       w_it.forward()) {
    if (w_it.data() == nullptr || w_it.data()->word == nullptr) continue;
    TBOX w_box = w_it.data()->word->bounding_box();
    int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
    int width_limit = w_box.width() / kSignificantOverlapFraction;
    int min_significant_overlap = std::max(height_limit, width_limit);
    int overlap = w_box.intersection(current_box).width();
    int prev_overlap = w_box.intersection(prev_box).width();
    int next_overlap = w_box.intersection(next_box).width();
    if (overlap > min_significant_overlap) {
      if (prev_overlap > min_significant_overlap) {
        // We have no choice but to use the LSTM word edge.
        clipped_box.set_left(current_box.left());
      } else if (next_overlap > min_significant_overlap) {
        // We have no choice but to use the LSTM word edge.
        clipped_box.set_right(current_box.right());
      } else {
        clipped_box += w_box;
      }
    }
  }
  if (clipped_box.height() <= 0) {
    clipped_box.set_top(current_box.top());
    clipped_box.set_bottom(current_box.bottom());
  }
  if (clipped_box.width() <= 0) clipped_box = current_box;
  return clipped_box;
}

// Helper moves the blob from src to dest. If it isn't contained by clip_box,
// the blob is replaced by a fake that is contained.
static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
                            const TBOX& clip_box) {
  C_BLOB* src_blob = src_it->extract();
  TBOX box = src_blob->bounding_box();
  if (!clip_box.contains(box)) {
    int left =
        ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
    int right =
        ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
    int top =
        ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
    int bottom =
        ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
    box = TBOX(left, bottom, right, top);
    delete src_blob;
    src_blob = C_BLOB::FakeBlob(box);
  }
  dest_it->add_after_then_move(src_blob);
  return box;
}

// Replaces the current WERD/WERD_RES with the given words. The given words
// contain fake blobs that indicate the position of the characters. These are
// replaced with real blobs from the current word as much as possible.
void PAGE_RES_IT::ReplaceCurrentWord(
    tesseract::PointerVector<WERD_RES>* words) {
  if (words->empty()) {
    DeleteCurrentWord();
    return;
  }
  WERD_RES* input_word = word();
  // Set the BOL/EOL flags on the words from the input word.
  if (input_word->word->flag(W_BOL)) {
    (*words)[0]->word->set_flag(W_BOL, true);
  } else {
    (*words)[0]->word->set_blanks(input_word->word->space());
  }
  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));

  // Move the blobs from the input word to the new set of words.
  // If the input word_res is a combination, then the replacements will also be
  // combinations, and will own their own words. If the input word_res is not a
  // combination, then the final replacements will not be either, (although it
  // is allowed for the input words to be combinations) and their words
  // will get put on the row list. This maintains the ownership rules.
  WERD_IT w_it(row()->row->word_list());
  if (!input_word->combination) {
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD* word = w_it.data();
      if (word == input_word->word)
        break;
    }
    // w_it is now set to the input_word's word.
    ASSERT_HOST(!w_it.cycled_list());
  }
  // Insert into the appropriate place in the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    WERD_RES* word = wr_it.data();
    if (word == input_word)
      break;
  }
  ASSERT_HOST(!wr_it.cycled_list());
  // Since we only have an estimate of the bounds between blobs, use the blob
  // x-middle as the determiner of where to put the blobs
  C_BLOB_IT src_b_it(input_word->word->cblob_list());
  src_b_it.sort(&C_BLOB::SortByXMiddle);
  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
  rej_b_it.sort(&C_BLOB::SortByXMiddle);
  TBOX clip_box;
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word_w = (*words)[w];
    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
    // Compute blob boundaries.
    GenericVector<int> blob_ends;
    C_BLOB_LIST* next_word_blobs =
        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
    // Remove the fake blobs on the current word, but keep safe for back-up if
    // no blob can be found.
    C_BLOB_LIST fake_blobs;
    C_BLOB_IT fake_b_it(&fake_blobs);
    fake_b_it.add_list_after(word_w->word->cblob_list());
    fake_b_it.move_to_first();
    word_w->word->cblob_list()->clear();
    C_BLOB_IT dest_it(word_w->word->cblob_list());
    // Build the box word as we move the blobs.
    auto* box_word = new tesseract::BoxWord;
    for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
      int end_x = blob_ends[i];
      TBOX blob_box;
      // Add the blobs up to end_x.
      while (!src_b_it.empty() &&
             src_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
        src_b_it.forward();
      }
      while (!rej_b_it.empty() &&
             rej_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
        rej_b_it.forward();
      }
      if (blob_box.null_box()) {
        // Use the original box as a back-up.
        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
      }
      box_word->InsertBox(i, blob_box);
    }
    delete word_w->box_word;
    word_w->box_word = box_word;
    if (!input_word->combination) {
      // Insert word_w->word into the ROW. It doesn't own its word, so the
      // ROW needs to own it.
      w_it.add_before_stay_put(word_w->word);
      word_w->combination = false;
    }
    (*words)[w] = nullptr;  // We are taking ownership.
    wr_it.add_before_stay_put(word_w);
  }
  // We have taken ownership of the words.
  words->clear();
  // Delete the current word, which has been replaced. We could just call
  // DeleteCurrentWord, but that would iterate both lists again, and we know
  // we are already in the right place.
  if (!input_word->combination)
    delete w_it.extract();
  delete wr_it.extract();
  ResetWordIterator();
}

// Deletes the current WERD_RES and its underlying WERD.
void PAGE_RES_IT::DeleteCurrentWord() {
  // Check that this word is as we expect. part_of_combos are NEVER iterated
  // by the normal iterator, so we should never be trying to delete them.
  ASSERT_HOST(!word_res->part_of_combo);
  if (!word_res->combination) {
    // Combinations own their own word, so we won't find the word on the
    // row's word_list, but it is legitimate to try to delete them.
    // Delete word from the ROW when not a combination.
    WERD_IT w_it(row()->row->word_list());
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      if (w_it.data() == word_res->word) {
        break;
      }
    }
    ASSERT_HOST(!w_it.cycled_list());
    delete w_it.extract();
  }
  // Remove the WERD_RES for the new_word.
  // Remove the WORD_RES from the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    if (wr_it.data() == word_res) {
      word_res = nullptr;
      break;
    }
  }
  ASSERT_HOST(!wr_it.cycled_list());
  delete wr_it.extract();
  ResetWordIterator();
}

// Makes the current word a fuzzy space if not already fuzzy. Updates
// corresponding part of combo if required.
void PAGE_RES_IT::MakeCurrentWordFuzzy() {
  WERD* real_word = word_res->word;
  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
    real_word->set_flag(W_FUZZY_SP, true);
    if (word_res->combination) {
      // The next word should be the corresponding part of combo, but we have
      // already stepped past it, so find it by search.
      WERD_RES_IT wr_it(&row()->word_res_list);
      for (wr_it.mark_cycle_pt();
           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
      }
      wr_it.forward();
      ASSERT_HOST(wr_it.data()->part_of_combo);
      real_word = wr_it.data()->word;
      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
                  !real_word->flag(W_FUZZY_NON));
      real_word->set_flag(W_FUZZY_SP, true);
    }
  }
}

/*************************************************************************
 * PAGE_RES_IT::restart_page
 *
 * Set things up at the start of the page
 *************************************************************************/

WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
  block_res_it.set_to_list(&page_res->block_res_list);
  block_res_it.mark_cycle_pt();
  prev_block_res = nullptr;
  prev_row_res = nullptr;
  prev_word_res = nullptr;
  block_res = nullptr;
  row_res = nullptr;
  word_res = nullptr;
  next_block_res = nullptr;
  next_row_res = nullptr;
  next_word_res = nullptr;
  internal_forward(true, empty_ok);
  return internal_forward(false, empty_ok);
}

// Recovers from operations on the current word, such as in InsertCloneWord
// and DeleteCurrentWord.
// Resets the word_res_it so that it is one past the next_word_res, as
// it should be after internal_forward. If next_row_res != row_res,
// then the next_word_res is in the next row, so there is no need to do
// anything to word_res_it, but it is still a good idea to reset the pointers
// word_res and prev_word_res, which are still in the current row.
void PAGE_RES_IT::ResetWordIterator() {
  if (row_res == next_row_res) {
    // Reset the member iterator so it can move forward and detect the
    // cycled_list state correctly.
    word_res_it.move_to_first();
    for (word_res_it.mark_cycle_pt();
         !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
         word_res_it.forward()) {
      if (!word_res_it.data()->part_of_combo) {
        if (prev_row_res == row_res) prev_word_res = word_res;
        word_res = word_res_it.data();
      }
    }
    ASSERT_HOST(!word_res_it.cycled_list());
    wr_it_of_next_word = word_res_it;
    word_res_it.forward();
  } else {
    // word_res_it is OK, but reset word_res and prev_word_res if needed.
    WERD_RES_IT wr_it(&row_res->word_res_list);
    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
      if (!wr_it.data()->part_of_combo) {
        if (prev_row_res == row_res) prev_word_res = word_res;
        word_res = wr_it.data();
      }
    }
  }
}

/*************************************************************************
 * PAGE_RES_IT::internal_forward
 *
 * Find the next word on the page. If empty_ok is true, then non-text blocks
 * and text blocks with no text are visited as if they contain a single
 * imaginary word in a single imaginary row. (word() and row() both return nullptr
 * in such a block and the return value is nullptr.)
 * If empty_ok is false, the old behaviour is maintained. Each real word
 * is visited and empty and non-text blocks and rows are skipped.
 * new_block is used to initialize the iterators for a new block.
 * The iterator maintains pointers to block, row and word for the previous,
 * current and next words.  These are correct, regardless of block/row
 * boundaries. nullptr values denote start and end of the page.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
  bool new_row = false;

  prev_block_res = block_res;
  prev_row_res = row_res;
  prev_word_res = word_res;
  block_res = next_block_res;
  row_res = next_row_res;
  word_res = next_word_res;
  wr_it_of_current_word = wr_it_of_next_word;
  next_block_res = nullptr;
  next_row_res = nullptr;
  next_word_res = nullptr;

  while (!block_res_it.cycled_list()) {
    if (new_block) {
      new_block = false;
      row_res_it.set_to_list(&block_res_it.data()->row_res_list);
      row_res_it.mark_cycle_pt();
      if (row_res_it.empty() && empty_ok) {
        next_block_res = block_res_it.data();
        break;
      }
      new_row = true;
    }
    while (!row_res_it.cycled_list()) {
      if (new_row) {
        new_row = false;
        word_res_it.set_to_list(&row_res_it.data()->word_res_list);
        word_res_it.mark_cycle_pt();
      }
      // Skip any part_of_combo words.
      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
        word_res_it.forward();
      if (!word_res_it.cycled_list()) {
        next_block_res = block_res_it.data();
        next_row_res = row_res_it.data();
        next_word_res = word_res_it.data();
        wr_it_of_next_word = word_res_it;
        word_res_it.forward();
        goto foundword;
      }
      // end of row reached
      row_res_it.forward();
      new_row = true;
    }
    // end of block reached
    block_res_it.forward();
    new_block = true;
  }
  foundword:
  // Update prev_word_best_choice pointer.
  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
    *page_res->prev_word_best_choice =
      (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
  }
  return word_res;
}

/*************************************************************************
 * PAGE_RES_IT::restart_row()
 *
 * Move to the beginning (leftmost word) of the current row.
 *************************************************************************/
WERD_RES *PAGE_RES_IT::restart_row() {
  ROW_RES *row = this->row();
  if (!row) return nullptr;
  for (restart_page(); this->row() != row; forward()) {
    // pass
  }
  return word();
}

/*************************************************************************
 * PAGE_RES_IT::forward_paragraph
 *
 * Move to the beginning of the next paragraph, allowing empty blocks.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::forward_paragraph() {
  while (block_res == next_block_res &&
         (next_row_res != nullptr && next_row_res->row != nullptr &&
          row_res->row->para() == next_row_res->row->para())) {
    internal_forward(false, true);
  }
  return internal_forward(false, true);
}

/*************************************************************************
 * PAGE_RES_IT::forward_block
 *
 * Move to the beginning of the next block, allowing empty blocks.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::forward_block() {
  while (block_res == next_block_res) {
    internal_forward(false, true);
  }
  return internal_forward(false, true);
}

void PAGE_RES_IT::rej_stat_word() {
  int16_t chars_in_word;
  int16_t rejects_in_word = 0;

  chars_in_word = word_res->reject_map.length ();
  page_res->char_count += chars_in_word;
  block_res->char_count += chars_in_word;
  row_res->char_count += chars_in_word;

  rejects_in_word = word_res->reject_map.reject_count ();

  page_res->rej_count += rejects_in_word;
  block_res->rej_count += rejects_in_word;
  row_res->rej_count += rejects_in_word;
  if (chars_in_word == rejects_in_word)
    row_res->whole_word_rej_count += rejects_in_word;
}

} // namespace tesseract