21 #pragma warning(disable:4244) // Conversion warnings
86 for (; blob !=
NULL; blob = blob->
next) {
103 inT16 *accepted_match_count) {
132 int expected_outline_count;
137 expected_outline_count = 2;
139 expected_outline_count = 1;
140 return abs (outline_count - expected_outline_count);
144 BOOL8 good_quality_doc) {
174 while (page_res_it.
word () !=
NULL) {
177 word = page_res_it.
word ();
179 if (word->
reject_map[i].accept_if_good_quality ())
188 word = page_res_it.
word ();
201 current_row = page_res_it.
row ();
202 while ((page_res_it.
word () !=
NULL) &&
203 (page_res_it.
row () == current_row))
211 current_block =
NULL;
213 while (page_res_it.
word () !=
NULL) {
214 if (current_block != page_res_it.
block ()) {
215 current_block = page_res_it.
block ();
219 if (current_row != page_res_it.
row ()) {
220 current_row = page_res_it.
row ();
240 BOOL8 good_quality_doc) {
247 BOOL8 prev_word_rejected;
248 inT16 char_quality = 0;
249 inT16 accepted_char_quality;
255 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
261 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
270 while ((word = page_res_it.
word()) !=
NULL) {
271 current_block = page_res_it.
block();
277 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
281 prev_word_rejected =
FALSE;
282 while ((word = page_res_it.
word()) !=
NULL &&
283 (page_res_it.
block() == current_block)) {
296 &accepted_char_quality);
309 prev_word_rejected &&
315 prev_word_rejected = rej_word;
320 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
327 while ((word = page_res_it.
word()) !=
NULL &&
328 page_res_it.
block() == current_block) {
329 current_row = page_res_it.
row();
343 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
347 prev_word_rejected =
FALSE;
348 while ((word = page_res_it.
word()) !=
NULL &&
349 page_res_it.
row () == current_row) {
367 &accepted_char_quality);
380 prev_word_rejected &&
386 prev_word_rejected = rej_word;
391 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
394 while (page_res_it.
word() !=
NULL &&
395 page_res_it.
row() == current_row)
415 while (page_res_it.
word () !=
NULL) {
433 while (page_res_it.
word() !=
NULL) {
439 word = page_res_it.
word();
448 found_terrible_word =
FALSE;
450 prev_potential_marked =
FALSE;
459 tprintf (
"T CRUNCHING: \"%s\"\n",
463 if (prev_potential_marked) {
464 while (copy_it.
word () != word) {
466 tprintf (
"P1 CRUNCHING: \"%s\"\n",
472 prev_potential_marked =
FALSE;
474 found_terrible_word =
TRUE;
478 garbage_level, ok_dict_word))) {
479 if (found_terrible_word) {
481 tprintf (
"P2 CRUNCHING: \"%s\"\n",
486 else if (!prev_potential_marked) {
487 copy_it = page_res_it;
488 prev_potential_marked =
TRUE;
490 tprintf (
"P3 CRUNCHING: \"%s\"\n",
496 found_terrible_word =
FALSE;
498 prev_potential_marked =
FALSE;
500 tprintf (
"NO CRUNCH: \"%s\"\n",
531 (garbage_level !=
G_OK))
534 (garbage_level !=
G_OK))
537 if (crunch_mode > 0) {
539 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
550 BOOL8 ok_dict_word) {
555 BOOL8 word_crunchable;
556 int poor_indicator_count = 0;
565 if (adjusted_len > 10)
571 tprintf(
"Potential poor rating on \"%s\"\n",
574 poor_indicator_count++;
577 if (word_crunchable &&
580 tprintf(
"Potential poor cert on \"%s\"\n",
583 poor_indicator_count++;
586 if (garbage_level !=
G_OK) {
588 tprintf(
"Potential garbage on \"%s\"\n",
591 poor_indicator_count++;
601 inT16 debug_delete_mode;
603 inT16 x_debug_delete_mode;
607 while (page_res_it.
word() !=
NULL) {
608 word = page_res_it.
word();
614 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
619 deleting_from_bol =
TRUE;
621 if (marked_delete_point) {
622 while (copy_it.
word() != word) {
624 x_debug_delete_mode);
626 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
635 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
640 deleting_from_bol =
FALSE;
641 marked_delete_point =
FALSE;
644 if (!marked_delete_point) {
645 copy_it = page_res_it;
646 marked_delete_point =
TRUE;
651 deleting_from_bol =
FALSE;
653 marked_delete_point =
FALSE;
672 bool modified =
false;
704 int isolated_digits = 0;
705 int isolated_alphas = 0;
706 int bad_char_count = 0;
711 int alpha_repetition_count = 0;
712 int longest_alpha_repetition_count = 0;
713 int longest_lower_run_len = 0;
714 int lower_string_count = 0;
715 int longest_upper_run_len = 0;
716 int upper_string_count = 0;
717 int total_alpha_count = 0;
718 int total_digit_count = 0;
720 for (; *str !=
'\0'; str += *(lengths++)) {
725 case SUBSEQUENT_UPPER:
727 state = SUBSEQUENT_UPPER;
728 upper_string_count++;
729 if (longest_upper_run_len < upper_string_count)
730 longest_upper_run_len = upper_string_count;
732 alpha_repetition_count++;
733 if (longest_alpha_repetition_count < alpha_repetition_count) {
734 longest_alpha_repetition_count = alpha_repetition_count;
739 alpha_repetition_count = 1;
747 alpha_repetition_count = 1;
748 upper_string_count = 1;
755 case SUBSEQUENT_LOWER:
757 state = SUBSEQUENT_LOWER;
758 lower_string_count++;
759 if (longest_lower_run_len < lower_string_count)
760 longest_lower_run_len = lower_string_count;
762 alpha_repetition_count++;
763 if (longest_alpha_repetition_count < alpha_repetition_count) {
764 longest_alpha_repetition_count = alpha_repetition_count;
769 alpha_repetition_count = 1;
777 alpha_repetition_count = 1;
778 lower_string_count = 1;
786 state = SUBSEQUENT_NUM;
798 if (*lengths == 1 && *str ==
' ')
828 total_alpha_count += total_digit_count - isolated_digits;
832 2 * (total_alpha_count - isolated_alphas) > len &&
842 strpbrk(str,
" ") ==
NULL &&
851 ok_chars = len - bad_char_count - isolated_digits -
852 isolated_alphas - tess_rejs;
855 tprintf(
"garbage_word: \"%s\"\n",
857 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
859 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
861 if (bad_char_count == 0 &&
863 (len > isolated_digits + isolated_alphas || len <= 2))
866 if (tess_rejs > ok_chars ||
867 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
871 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
873 if (dodgy_chars > 5 || (dodgy_chars / (
float) len) > 0.5)
878 dodgy_chars = 2 * tess_rejs + bad_char_count;
879 if ((len == 4 && dodgy_chars > 2) ||
880 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
979 for (; *str !=
'\0'; str++) {
989 inT16 outline_count = 0;
990 inT16 small_outline_count = 0;
994 for (
TBLOB* blob = word->
blobs; blob !=
NULL; blob = blob->next) {
995 for (
TESSLINE* ol = blob->outlines; ol !=
NULL; ol = ol->next) {
997 box = ol->bounding_box();
999 max_dimension = box.
height();
1001 max_dimension = box.
width();
1002 if (max_dimension < small_limit)
1003 small_outline_count++;
1006 return (small_outline_count >= outline_count);