Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
dict.h
Go to the documentation of this file.
1 
2 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "dawg.h"
24 #include "host.h"
25 #include "image.h"
26 #include "oldlist.h"
27 #include "ratngs.h"
28 #include "stopper.h"
29 #include "trie.h"
30 #include "unicharset.h"
31 #include "permute.h"
32 
// Maximum word length (per the macro name); also bounds the loop in the
// DawgArgs constructor. The replacement list is fully parenthesized so the
// cast cannot bind to operators at the point of use (for example
// `sizeof MAX_WERD_LENGTH` or a trailing postfix operator).
#define MAX_WERD_LENGTH ((inT64) 128)
// Value -1; presumably a "no rating available" sentinel -- TODO confirm
// against users. Parenthesized so the unary minus always stays attached to
// the literal after expansion.
#define NO_RATING (-1)
35 
41  float rating;
42  float certainty;
43 };
44 
45 namespace tesseract {
46 
48 
49 //
50 // Constants
51 //
// Passed as sought_word_length by the two-argument dawg_permute_and_select()
// overload, i.e. when no particular word length is requested.
static const int kAnyWordLength = -1;
static const int kRatingPad = 4;
// TODO(daria): If hyphens are different in different languages and can be
// inferred from training data we should load their values dynamically.
static const char kHyphenSymbol[] = "-";
// NOTE(review): "Edgees" is a misspelling, but the identifier is part of the
// header's interface and is therefore left unchanged.
static const int kMaxNumDawgEdgees = 2000000;
static const int kMaxDocDawgEdges = 250000;
static const int kMaxUserDawgEdges = 50000;
// The float constants use float literals so that no implicit double -> float
// conversion takes place; the values themselves are unchanged.
static const float kSimCertaintyScale = -10.0f;   // similarity matcher scaling
static const float kSimCertaintyOffset = -10.0f;  // similarity matcher offset
static const float kSimilarityFloor = 100.0f;     // worst E*L product to stop on
static const int kDocDictMaxRepChars = 4;
64 
65 struct DawgArgs {
67  DawgInfoVector *uc, float r, PermuterType p, int len, int e) :
70  for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
72  }
73  permuter = p;
74  sought_word_length = len;
76  }
81  PermuterType permuter;
83 
84  // TODO(daria): remove these fields when permdawg is deprecated.
85  float rating_margin;
88 };
89 
90 class Dict {
91  public:
92  Dict(Image* image_ptr);
93  ~Dict();
94  const Image* getImage() const {
95  return image_ptr_;
96  }
98  return image_ptr_;
99  }
100  const UNICHARSET& getUnicharset() const {
101  return getImage()->getCCUtil()->unicharset;
102  }
104  return getImage()->getCCUtil()->unicharset;
105  }
107  return getImage()->getCCUtil()->unichar_ambigs;
108  }
109 
110  inline bool compound_marker(UNICHAR_ID unichar_id) {
111  return (unichar_id == getUnicharset().unichar_to_id("-") ||
112  unichar_id == getUnicharset().unichar_to_id("/"));
113  }
114 
115  /* hyphen.cpp ************************************************************/
116 
118  inline bool hyphenated() const { return
119  !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
120  }
122  inline int hyphen_base_size() const {
123  return this->hyphenated() ? hyphen_word_->length() : 0;
124  }
128  inline void copy_hyphen_info(WERD_CHOICE *word) const {
129  if (this->hyphenated()) {
130  *word = *hyphen_word_;
131  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
132  }
133  }
137  inline void remove_hyphen_head(WERD_CHOICE *word) const {
138  if (this->hyphenated()) {
139  word->remove_unichar_ids(0, hyphen_word_->length());
140  if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
141  }
142  }
144  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
145  return (last_word_on_line_ && !first_pos &&
146  unichar_id == hyphen_unichar_id_);
147  }
149  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
150  int word_index = word.length() - 1;
151  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
152  }
156  void reset_hyphen_vars(bool last_word_on_line);
159  void set_hyphen_word(const WERD_CHOICE &word,
160  const DawgInfoVector &active_dawgs,
161  const DawgInfoVector &constraints);
162 
163  /* permdawg.cpp ************************************************************/
166  inline void update_best_choice(const WERD_CHOICE &word,
167  WERD_CHOICE *best_choice) {
168  if (word.rating() < best_choice->rating()) *best_choice = word;
169  }
173  void init_active_dawgs(int sought_word_length,
174  DawgInfoVector *active_dawgs,
175  bool ambigs_mode) const;
178  void init_constraints(DawgInfoVector *constraints) const;
// Returns true when rating_limit is zero or negative.
inline bool ambigs_mode(float rating_limit) {
  const bool limit_not_positive = (rating_limit <= 0.0);
  return limit_not_positive;
}
189  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
190  int sought_word_length, int end_char_choice_index);
192  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
193  return dawg_permute_and_select(char_choices, rating_limit,
194  kAnyWordLength, 0);
195  }
203  void go_deeper_dawg_fxn(
204  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
205  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
206  bool word_ending, WERD_CHOICE *word, float certainties[],
207  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
208  void *void_more_args);
209 
210  /* permute.cpp *************************************************************/
212  const BLOB_CHOICE_LIST_VECTOR &char_choices);
214  const BLOB_CHOICE_LIST_VECTOR &char_choices,
215  float* rating_limit,
216  WERD_CHOICE *raw_choice,
217  BOOL8 *any_alpha);
218  const char* choose_il1(const char *first_char, //first choice
219  const char *second_char, //second choice
220  const char *third_char, //third choice
221  const char *prev_char, //prev in word
222  const char *next_char, //next in word
223  const char *next_next_char); //after next next in word
224  WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
225  const WERD_CHOICE *best_choice,
226  WERD_CHOICE *raw_choice);
227  void end_permute();
228  void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
229  float rating_limit,
230  int start,
231  int end,
232  WERD_CHOICE *current_word);
233  bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
234  WERD_CHOICE *best_choice,
235  WERD_CHOICE *raw_choice);
237  const BLOB_CHOICE_LIST_VECTOR &char_choices,
238  float rating_limit);
243  const BLOB_CHOICE_LIST_VECTOR &char_choices,
244  PermuterState *permuter_state);
246  void incorporate_segcost(WERD_CHOICE* word);
251  const BLOB_CHOICE_LIST_VECTOR &char_choices,
252  PermuterState *permuter_state);
255  const BLOB_CHOICE_LIST_VECTOR &char_choices,
256  PermuterState *permuter_state);
257 
261  char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices,
262  char* pos_chartypes);
263 
265  const BLOB_CHOICE_LIST_VECTOR &char_choices,
266  float rating_limit);
272  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
273  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
274  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
275  WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
276 
278  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
279  float curr_rating, float curr_certainty,
280  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
281  const char *debug, int word_ending,
282  CHAR_FRAGMENT_INFO *char_frag_info);
283  void permute_choices(
284  const char *debug,
285  const BLOB_CHOICE_LIST_VECTOR &char_choices,
286  int char_choice_index,
287  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
288  WERD_CHOICE *word,
289  float certainties[],
290  float *limit,
291  WERD_CHOICE *best_choice,
292  int *attempts_left,
293  void *more_args);
294 
295  void append_choices(
296  const char *debug,
297  const BLOB_CHOICE_LIST_VECTOR &char_choices,
298  const BLOB_CHOICE &blob_choice,
299  int char_choice_index,
300  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
301  WERD_CHOICE *word,
302  float certainties[],
303  float *limit,
304  WERD_CHOICE *best_choice,
305  int *attempts_left,
306  void *more_args);
308  void (Dict::*go_deeper_fxn_)(const char *debug,
309  const BLOB_CHOICE_LIST_VECTOR &char_choices,
310  int char_choice_index,
311  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
312  bool word_ending, WERD_CHOICE *word,
313  float certainties[], float *limit,
314  WERD_CHOICE *best_choice, int *attempts_left,
315  void *void_more_args);
316  /* stopper.cpp *************************************************************/
317  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
318  DANGERR *fixpt,
319  bool fix_replaceable,
320  BLOB_CHOICE_LIST_VECTOR *Choices,
321  bool *modified_blobs);
322  double StopperAmbigThreshold(double f1, double f2) {
323  return (f2 - f1) * stopper_ambiguity_threshold_gain -
325  }
326  // If the certainty of any chunk in Choice (item1) is not ambiguous with the
327  // corresponding chunk in the best choice (item2), frees Choice and
328  // returns true.
329  int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice
330  void *item2); // EXPANDED_CHOICE *BestChoice
339  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
340  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
341  BLOB_CHOICE_LIST_VECTOR *blob_choices,
342  bool *modified_blobs);
343 
344  inline void DisableChoiceAccum() { keep_word_choices_ = false; }
345  inline void EnableChoiceAccum() { keep_word_choices_ = true; }
346  inline bool ChoiceAccumEnabled() { return keep_word_choices_; }
347 
349  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
356  VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
357  FLOAT32 AdjustFactor,
358  const float Certainties[]);
360  void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
363  bool StringSameAs(const WERD_CHOICE &WordChoice,
364  VIABLE_CHOICE ViableChoice);
366  bool StringSameAs(const char *String,
367  const char *String_lengths,
368  VIABLE_CHOICE ViableChoice);
376  int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
377  const WERD_CHOICE &BestChoice);
380  WERD_CHOICE *BestChoice,
381  DANGERR *fixpt,
383  bool *modified_blobs);
387  bool AcceptableResult(const WERD_CHOICE &BestChoice);
390  int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
398  void LogNewChoice(FLOAT32 AdjustFactor,
399  const float Certainties[],
400  bool raw_choice,
401  WERD_CHOICE *WordChoice,
402  const BLOB_CHOICE_LIST_VECTOR &blob_choices);
403  void EndDangerousAmbigs();
405  bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
409  bool CurrentWordAmbig();
411  void DebugWordChoices();
413  void PrintAmbigAlternatives(FILE *file, const char *label,
414  int label_num_unichars);
417  void FillViableChoice(const WERD_CHOICE &WordChoice,
418  FLOAT32 AdjustFactor, const float Certainties[],
419  VIABLE_CHOICE ViableChoice);
422  bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
425  void FilterWordChoices();
440  void FindClassifierErrors(FLOAT32 MinRating,
441  FLOAT32 MaxRating,
442  FLOAT32 RatingMargin,
443  FLOAT32 Thresholds[]);
446  void InitChoiceAccum();
448  void ClearBestChoiceAccum();
452  void LogNewSegmentation(PIECES_STATE BlobWidth);
455  void LogNewSplit(int Blob);
458  void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
460  void SettupStopperPass1();
462  void SettupStopperPass2();
463  /* context.cpp *************************************************************/
465  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
468  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
469 
470  /* dict.cpp ****************************************************************/
471 
474  void Load();
475  void End();
476 
477  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
479  if (pending_words_ != NULL)
480  pending_words_->clear();
481  if (document_words_ != NULL)
482  document_words_->clear();
483  }
484 
485  // Create unicharset adaptations of known, short lists of UTF-8 equivalent
486  // characters (think all hyphen-like symbols). The first version of the
487  // list is taken as equivalent for matching against the dictionary.
488  void LoadEquivalenceList(const char *unichar_strings[]);
489 
490  // Normalize all hyphens and apostrophes to the canonicalized one for
491  // matching; pass everything else through as is. See LoadEquivalenceList().
493 
556  //
557  int def_letter_is_okay(void* void_dawg_args,
558  UNICHAR_ID unichar_id, bool word_end) const;
559 
// Pointer to the Dict member function that LetterIsOkay() forwards to; it
// has the same signature as def_letter_is_okay().
560  int (Dict::*letter_is_okay_)(void* void_dawg_args,
561  UNICHAR_ID unichar_id, bool word_end) const;
563  int LetterIsOkay(void* void_dawg_args,
564  UNICHAR_ID unichar_id, bool word_end) const {
565  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
566  }
567 
568 
// Pointer to the Dict member function that ProbabilityInContext() forwards
// to; it takes the language string in addition to the context/character
// byte ranges supplied by the caller.
570  double (Dict::*probability_in_context_)(const char* lang,
571  const char* context,
572  int context_bytes,
573  const char* character,
574  int character_bytes);
576  double ProbabilityInContext(const char* context,
577  int context_bytes,
578  const char* character,
579  int character_bytes) {
580  return (this->*probability_in_context_)(
581  getImage()->getCCUtil()->lang.string(),
582  context, context_bytes,
583  character, character_bytes);
584  }
585 
588  const char* lang, const char* context, int context_bytes,
589  const char* character, int character_bytes) {
590  (void) context;
591  (void) context_bytes;
592  (void) character;
593  (void) character_bytes;
594  return 0.0;
595  }
596  double ngram_probability_in_context(const char* lang,
597  const char* context,
598  int context_bytes,
599  const char* character,
600  int character_bytes);
601 
603  inline const int NumDawgs() const { return dawgs_.size(); }
605  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
607  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
609  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
611  inline const Dawg *GetFixedLengthDawg(int word_length) const {
612  if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
613  assert(dawgs_.size() > word_length);
614  return dawgs_[word_length];
615  }
616  inline const int GetMaxFixedLengthDawgIndex() const {
617  return max_fixed_length_dawgs_wdlen_;
618  }
620  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
621  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
622  NODE_REF node = dawg->next_node(edge_ref);
623  if (node == 0) node = NO_EDGE; // end of word
624  return node;
625  }
631  inline bool ConstraintsOk(const DawgInfoVector &constraints,
632  int word_end, DawgType current_dawg_type) const {
633  if (!word_end) return true;
634  if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
635  for (int c = 0; c < constraints.length(); ++c) {
636  const DawgInfo &cinfo = constraints[c];
637  Dawg *cdawg = dawgs_[cinfo.dawg_index];
638  if (!cdawg->end_of_word(cinfo.ref)) {
639  if (dawg_debug_level >= 3) {
640  tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
641  cinfo.dawg_index, cinfo.ref);
642  }
643  return false;
644  }
645  }
646  return true;
647  }
648 
654  void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
655  UNICHAR_ID unichar_id, bool word_end,
656  DawgArgs *dawg_args,
657  PermuterType *current_permuter) const;
658 
662 
668  static void ReadFixedLengthDawgs(DawgType type, const STRING &lang,
669  PermuterType perm, int debug_level,
670  FILE *file, DawgVector *dawg_vec,
671  int *max_wdlen);
674  static void WriteFixedLengthDawgs(
675  const GenericVector<SquishedDawg *> &dawg_vec,
676  int num_dawgs, int debug_level, FILE *output_file);
677 
679  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
680  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
681  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
682  perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
683  }
684  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
685  int valid_word(const WERD_CHOICE &word) const {
686  return valid_word(word, false); // return NO_PERM for words with digits
687  }
688  int valid_word_or_number(const WERD_CHOICE &word) const {
689  return valid_word(word, true); // return NUMBER_PERM for valid numbers
690  }
692  int valid_word(const char *string) const {
693  WERD_CHOICE word(string, getUnicharset());
694  return valid_word(word);
695  }
696  // Do the two WERD_CHOICEs form a meaningful bigram?
697  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
702  bool valid_punctuation(const WERD_CHOICE &word);
704  int good_choice(const WERD_CHOICE &choice);
706  void add_document_word(const WERD_CHOICE &best_choice);
707  int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
708  const UNICHARSET &unicharset);
710  void adjust_word(WERD_CHOICE *word, float *certainty_array,
711  const BLOB_CHOICE_LIST_VECTOR *char_choices,
712  bool nonword, float additional_adjust, bool debug);
713  void adjust_word(WERD_CHOICE *word, float *certainty_array,
714  const BLOB_CHOICE_LIST_VECTOR *char_choices,
715  bool debug) {
716  adjust_word(word, certainty_array, char_choices, false, 0.0f,
717  debug);
718  }
720  float *certainty_array,
721  const BLOB_CHOICE_LIST_VECTOR *char_choices,
722  bool debug) {
723  adjust_word(word, certainty_array, char_choices, true, 0.0f, debug);
724  }
726  inline void SetWordsegRatingAdjustFactor(float f) {
727  wordseg_rating_adjust_factor_ = f;
728  }
729  // Accessor for best_choices_.
730  const LIST &getBestChoices() { return best_choices_; }
731 
732  private:
734  Image* image_ptr_;
741  UnicharAmbigs *dang_ambigs_table_;
743  UnicharAmbigs *replace_ambigs_table_;
748  bool keep_word_choices_;
750  FLOAT32 reject_offset_;
752  PIECES_STATE current_segmentation_;
754  VIABLE_CHOICE best_raw_choice_;
755  LIST raw_choices_;
756  LIST best_choices_;
757  // Hyphen-related variables.
758  UNICHAR_ID hyphen_unichar_id_;
759  WERD_CHOICE *hyphen_word_;
760  DawgInfoVector hyphen_active_dawgs_;
761  DawgInfoVector hyphen_constraints_;
762  bool last_word_on_line_;
763  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
764  // matching. The first member of each list is taken as canonical. For
765  // example, the first list contains hyphens and dashes with the first symbol
766  // being the ASCII hyphen minus.
767  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
768  // Dawgs.
769  DawgVector dawgs_;
770  SuccessorListsVector successors_;
771  Trie *pending_words_;
772  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
773  // any of them are present on the best choices list for a word pair.
774  // the bigrams are stored as space-separated words where:
775  // (1) leading and trailing punctuation has been removed from each word and
776  // (2) any digits have been replaced with '?' marks.
777  Dawg *bigram_dawg_;
780  // TODO(daria): need to support multiple languages in the future,
781  // so maybe will need to maintain a list of dawgs of each kind.
782  Dawg *freq_dawg_;
783  Dawg *unambig_dawg_;
784  Dawg *punc_dawg_;
785  Trie *document_words_;
788  int max_fixed_length_dawgs_wdlen_;
791  float wordseg_rating_adjust_factor_;
792  // File for recording ambiguities discovered during dictionary search.
793  FILE *output_ambig_words_file_;
794 
795  public:
799  STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
801  "A list of user-provided patterns.");
802  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
803  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
804  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
806  "Load dawg with punctuation patterns.");
807  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
808  BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
809  " dawgs (e.g. for non-space delimited languages)");
811  "Load dawg with special word bigrams.");
813  "Score multiplier for word matches which have good case and"
814  "are frequent in the given language (lower is better).");
815 
817  "Score multiplier for word matches that have good case "
818  "(lower is better).");
819 
821  "Default score multiplier for word matches, which may have "
822  "case issues (lower is better).");
823 
824  // TODO(daria): remove this param when ngram permuter is deprecated.
826  "Multipler to for the best choice from the ngram model.");
827 
829  "Score multiplier for glyph fragment segmentations which "
830  "do not match a dictionary word (lower is better).");
831 
833  "Score multiplier for poorly cased strings that are not in"
834  " the dictionary and generally look like garbage (lower is"
835  " better).");
837  "Output file for ambiguities found in the dictionary");
838  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
839  ", to 2 for more details, to 3 to see all the debug messages");
840  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
841  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
843  "Use only the first UTF8 step of the given string"
844  " when computing log probabilities.");
845  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
847  "Certainty threshold for non-dict words");
849  "Reject certainty offset");
851  "Size of dict word to be treated as non-dict word");
853  "Certainty to add for each dict char above small word size.");
855  "Max certaintly variation allowed in a word (in sigma)");
856  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
858  "Make AcceptableChoice() always return false. Useful"
859  " when there is a need to explore all segmentations");
861  "Gain factor for ambiguity threshold.");
863  "Certainty offset for ambiguity threshold.");
864  BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
865  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
866  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
867  " should be printed to stdout");
869  "Lengths of unichars in word_to_debug");
870  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
871  INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
872  BOOL_VAR_H(permute_debug, 0, "Debug char permutation process");
873  double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
874  " current best rate to prune other hypotheses");
876  "Turn on word script consistency permuter");
878  "incorporate segmentation cost in word rating?");
880  "Don't use any alphabetic-specific tricks."
881  "Set to true in the traineddata config file for"
882  " scripts that are cursive or inherently fixed-pitch");
884  "Score multipler for script consistency within a word. "
885  "Being a 'reward' factor, it should be <= 1. "
886  "Smaller value implies bigger reward.");
888  "Turn on fixed-length phrasebook search permuter");
890  "Turn on character type (property) consistency permuter");
892  "Score multipler for char type consistency within a word. ");
893  // TODO(daria): remove this param when ngram permuter is deprecated.
895  "Score multipler for ngram permuter's best choice"
896  " (only used in the Han script path).");
897  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
898  BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary ");
900  "Worst certainty for using pending dictionary");
901  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
902  " for words that can be inserted into the document dictionary");
904  "Activate character-level n-gram-based permuter");
905  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
906  " character choices to consider during permutation."
907  " This limit is especially useful when user patterns"
908  " are specified, since overly generic patterns can result in"
909  " dawg search exploring an overly large number of options.");
910  BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
911 };
912 } // namespace tesseract
913 
914 #endif // TESSERACT_DICT_DICT_H_