19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
// Presumably the maximum word length handled by the dictionary code, typed as
// inT64 (meaning inferred from the name; confirm against its users).
// The replacement list is fully parenthesized so the cast cannot combine with
// neighbouring tokens at the point of use (e.g. `sizeof MAX_WERD_LENGTH`).
33 #define MAX_WERD_LENGTH ((inT64) 128)
// File-scope tuning constants for the dictionary code. The remarks below are
// inferred from the identifier names only; confirm against the code that
// uses each constant.
// Presumably a sentinel meaning "no particular word length" -- TODO confirm.
52 static const int kAnyWordLength = -1;
53 static const int kRatingPad = 4;
// The hyphen character as a NUL-terminated string.
56 static const char kHyphenSymbol[] =
"-";
// NOTE(review): "Edgees" looks like a misspelling of "Edges" (compare the two
// constants that follow). Renaming it would require updating every user.
57 static const int kMaxNumDawgEdgees = 2000000;
58 static const int kMaxDocDawgEdges = 250000;
59 static const int kMaxUserDawgEdges = 50000;
// The three float constants below are initialized from double literals.
60 static const float kSimCertaintyScale = -10.0;
61 static const float kSimCertaintyOffset = -10.0;
62 static const float kSimilarityFloor = 100.0;
63 static const int kDocDictMaxRepChars = 4;
130 *word = *hyphen_word_;
145 return (last_word_on_line_ && !first_pos &&
146 unichar_id == hyphen_unichar_id_);
150 int word_index = word.
length() - 1;
168 if (word.
rating() < best_choice->
rating()) *best_choice = word;
181 return rating_limit <= 0.0;
190 int sought_word_length,
int end_char_choice_index);
206 bool word_ending,
WERD_CHOICE *word,
float certainties[],
207 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
208 void *void_more_args);
218 const char*
choose_il1(
const char *first_char,
219 const char *second_char,
220 const char *third_char,
221 const char *prev_char,
222 const char *next_char,
223 const char *next_next_char);
262 char* pos_chartypes);
274 bool word_ending,
WERD_CHOICE *word,
float certainties[],
float *limit,
275 WERD_CHOICE *best_choice,
int *attempts_left,
void *more_args);
279 float curr_rating,
float curr_certainty,
281 const char *debug,
int word_ending,
286 int char_choice_index,
299 int char_choice_index,
310 int char_choice_index,
313 float certainties[],
float *limit,
315 void *void_more_args);
319 bool fix_replaceable,
321 bool *modified_blobs);
339 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
342 bool *modified_blobs);
358 const float Certainties[]);
367 const char *String_lengths,
383 bool *modified_blobs);
399 const float Certainties[],
414 int label_num_unichars);
418 FLOAT32 AdjustFactor,
const float Certainties[],
479 if (pending_words_ !=
NULL)
480 pending_words_->
clear();
481 if (document_words_ !=
NULL)
482 document_words_->
clear();
574 int character_bytes);
579 int character_bytes) {
582 context, context_bytes,
588 const char*
lang,
const char* context,
int context_bytes,
589 const char*
character,
int character_bytes) {
591 (void) context_bytes;
593 (void) character_bytes;
600 int character_bytes);
// Returns the dawg pointer stored at position `index` of dawgs_.
// No range check is performed in this accessor.
605 inline const Dawg *GetDawg(int index) const {
  return dawgs_[index];
}
612 if (word_length > max_fixed_length_dawgs_wdlen_)
return NULL;
613 assert(dawgs_.
size() > word_length);
614 return dawgs_[word_length];
617 return max_fixed_length_dawgs_wdlen_;
621 if (edge_ref == NO_EDGE)
return 0;
623 if (node == 0) node = NO_EDGE;
632 int word_end,
DawgType current_dawg_type)
const {
633 if (!word_end)
return true;
635 for (
int c = 0; c < constraints.
length(); ++c) {
636 const DawgInfo &cinfo = constraints[c];
657 PermuterType *current_permuter)
const;
669 PermuterType perm,
int debug_level,
676 int num_dawgs,
int debug_level, FILE *output_file);
680 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
681 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
682 perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
712 bool nonword,
float additional_adjust,
bool debug);
716 adjust_word(word, certainty_array, char_choices,
false, 0.0
f,
720 float *certainty_array,
723 adjust_word(word, certainty_array, char_choices,
true, 0.0
f, debug);
727 wordseg_rating_adjust_factor_ =
f;
748 bool keep_word_choices_;
762 bool last_word_on_line_;
771 Trie *pending_words_;
785 Trie *document_words_;
788 int max_fixed_length_dawgs_wdlen_;
791 float wordseg_rating_adjust_factor_;
793 FILE *output_ambig_words_file_;
801 "A list of user-provided patterns.");
806 "Load dawg with punctuation patterns.");
809 " dawgs (e.g. for non-space delimited languages)");
811 "Load dawg with special word bigrams.");
// Adjacent string literals are concatenated, so the space separating
// "and" from "are" has to live inside one of the two literals.
813 "Score multiplier for word matches which have good case and"
814 " are frequent in the given language (lower is better).");
817 "Score multiplier for word matches that have good case "
818 "(lower is better).");
821 "Default score multiplier for word matches, which may have "
822 "case issues (lower is better).");
826 "Multipler to for the best choice from the ngram model.");
829 "Score multiplier for glyph fragment segmentations which "
830 "do not match a dictionary word (lower is better).");
833 "Score multiplier for poorly cased strings that are not in"
834 " the dictionary and generally look like garbage (lower is"
837 "Output file for ambiguities found in the dictionary");
839 ", to 2 for more details, to 3 to see all the debug messages");
843 "Use only the first UTF8 step of the given string"
844 " when computing log probabilities.");
847 "Certainty threshold for non-dict words");
849 "Reject certainty offset");
851 "Size of dict word to be treated as non-dict word");
853 "Certainty to add for each dict char above small word size.");
855 "Max certaintly variation allowed in a word (in sigma)");
858 "Make AcceptableChoice() always return false. Useful"
859 " when there is a need to explore all segmentations");
861 "Gain factor for ambiguity threshold.");
863 "Certainty offset for ambiguity threshold.");
867 " should be printed to stdout");
869 "Lengths of unichars in word_to_debug");
874 " current best rate to prune other hypotheses");
876 "Turn on word script consistency permuter");
878 "incorporate segmentation cost in word rating?");
// Adjacent string literals are concatenated, so each continuation literal
// starts with the space that separates it from the previous sentence.
880 "Don't use any alphabetic-specific tricks."
881 " Set to true in the traineddata config file for"
882 " scripts that are cursive or inherently fixed-pitch");
884 "Score multipler for script consistency within a word. "
885 "Being a 'reward' factor, it should be <= 1. "
886 "Smaller value implies bigger reward.");
888 "Turn on fixed-length phrasebook search permuter");
890 "Turn on character type (property) consistency permuter");
892 "Score multipler for char type consistency within a word. ");
895 "Score multipler for ngram permuter's best choice"
896 " (only used in the Han script path).");
900 "Worst certainty for using pending dictionary");
902 " for words that can be inserted into the document dictionary");
904 "Activate character-level n-gram-based permuter");
906 " character choices to consider during permutation."
907 " This limit is especially useful when user patterns"
908 " are specified, since overly generic patterns can result in"
909 " dawg search exploring an overly large number of options.");
914 #endif // TESSERACT_DICT_DICT_H_