Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ambigs.cpp
Go to the documentation of this file.
1 
2 // File: ambigs.cc
3 // Description: Functions for dealing with ambiguities
4 // (training and recognition).
5 // Author: Daria Antonova
6 // Created: Mon Feb 5 11:26:43 PDT 2009
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "ambigs.h"
22 #include "helpers.h"
23 
24 #ifdef _WIN32
25 #ifndef __GNUC__
26 #define strtok_r strtok_s
27 #else
28 #include "strtok_r.h"
29 #endif /* __GNUC__ */
30 #endif /* _WIN32 */
31 
32 namespace tesseract {
33 
35  wrong_ngram[0] = INVALID_UNICHAR_ID;
36  correct_fragments[0] = INVALID_UNICHAR_ID;
37  correct_ngram_id = INVALID_UNICHAR_ID;
38  type = NOT_AMBIG;
39  wrong_ngram_size = 0;
40 }
41 
43 
44 void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
45  inT64 end_offset,
46  int debug_level,
47  bool use_ambigs_for_adaption,
48  UNICHARSET *unicharset) {
49  int i, j;
50  UnicharIdVector *adaption_ambigs_entry;
51  for (i = 0; i < unicharset->size(); ++i) {
52  replace_ambigs_.push_back(NULL);
53  dang_ambigs_.push_back(NULL);
54  one_to_one_definite_ambigs_.push_back(NULL);
55  if (use_ambigs_for_adaption) {
56  ambigs_for_adaption_.push_back(NULL);
57  reverse_ambigs_for_adaption_.push_back(NULL);
58  }
59  }
60  if (debug_level) tprintf("Reading ambiguities\n");
61 
62  int TestAmbigPartSize;
63  int ReplacementAmbigPartSize;
64  // Maximum line size:
65  // 10 for sizes of ambigs, tabs, abmig type and newline
66  // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
67  // The space for buffer is allocated on the heap to avoid
68  // GCC frame size warning.
69  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
70  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
71  char *buffer = new char[kBufferSize];
72  char ReplacementString[kMaxAmbigStringSize];
73  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
74  int line_num = 0;
75  int type = NOT_AMBIG;
76 
77  // Determine the version of the ambigs file.
78  int version = 0;
79  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
80  strlen(buffer) > 0);
81  if (*buffer == 'v') {
82  version = static_cast<int>(strtol(buffer+1, NULL, 10));
83  ++line_num;
84  } else {
85  rewind(AmbigFile);
86  }
87  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
88  fgets(buffer, kBufferSize, AmbigFile) != NULL) {
89  chomp_string(buffer);
90  if (debug_level > 2) tprintf("read line %s\n", buffer);
91  ++line_num;
92  if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
93  buffer, &TestAmbigPartSize, TestUnicharIds,
94  &ReplacementAmbigPartSize,
95  ReplacementString, &type)) continue;
96  // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
97  AmbigSpec *ambig_spec = new AmbigSpec();
98  InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
99  TestAmbigPartSize, TestUnicharIds,
100  ReplacementAmbigPartSize, ReplacementString, type,
101  ambig_spec, unicharset);
102 
103  // Update one_to_one_definite_ambigs_.
104  if (TestAmbigPartSize == 1 &&
105  ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
106  if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
107  one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
108  }
109  one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
110  ambig_spec->correct_ngram_id);
111  }
112  // Update ambigs_for_adaption_.
113  if (use_ambigs_for_adaption) {
114  for (i = 0; i < TestAmbigPartSize; ++i) {
115  if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
116  ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
117  }
118  adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
119  const char *tmp_ptr = ReplacementString;
120  const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
121  int step = unicharset->step(tmp_ptr);
122  while (step > 0) {
123  UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
124  ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
125  // Add the new unichar id to adaption_ambigs_entry (only if the
126  // vector does not already contain it) keeping it in sorted order.
127  for (j = 0; j < adaption_ambigs_entry->size() &&
128  (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
129  if (j < adaption_ambigs_entry->size()) {
130  if ((*adaption_ambigs_entry)[j] != id_to_insert) {
131  adaption_ambigs_entry->insert(id_to_insert, j);
132  }
133  } else {
134  adaption_ambigs_entry->push_back(id_to_insert);
135  }
136  // Update tmp_ptr and step.
137  tmp_ptr += step;
138  step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
139  }
140  }
141  }
142  }
143  delete[] buffer;
144 
145  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
146  if (use_ambigs_for_adaption) {
147  for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
148  adaption_ambigs_entry = ambigs_for_adaption_[i];
149  if (adaption_ambigs_entry == NULL) continue;
150  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
151  UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
152  if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
153  reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
154  }
155  reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
156  }
157  }
158  }
159 
160  // Print what was read from the input file.
161  if (debug_level > 1) {
162  for (int tbl = 0; tbl < 2; ++tbl) {
163  const UnicharAmbigsVector &print_table =
164  (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
165  for (i = 0; i < print_table.size(); ++i) {
166  AmbigSpec_LIST *lst = print_table[i];
167  if (lst == NULL) continue;
168  if (!lst->empty()) {
169  tprintf("%s Ambiguities for %s:\n",
170  (tbl == 0) ? "Replaceable" : "Dangerous",
171  unicharset->debug_str(i).string());
172  }
173  AmbigSpec_IT lst_it(lst);
174  for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
175  AmbigSpec *ambig_spec = lst_it.data();
176  tprintf("wrong_ngram:");
177  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
178  tprintf("correct_fragments:");
179  UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
180  }
181  }
182  }
183  if (use_ambigs_for_adaption) {
184  for (int vec_id = 0; vec_id < 2; ++vec_id) {
185  const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
186  ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
187  for (i = 0; i < vec.size(); ++i) {
188  adaption_ambigs_entry = vec[i];
189  if (adaption_ambigs_entry != NULL) {
190  tprintf("%sAmbigs for adaption for %s:\n",
191  (vec_id == 0) ? "" : "Reverse ",
192  unicharset->debug_str(i).string());
193  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
194  tprintf("%s ", unicharset->debug_str(
195  (*adaption_ambigs_entry)[j]).string());
196  }
197  tprintf("\n");
198  }
199  }
200  }
201  }
202  }
203 }
204 
205 bool UnicharAmbigs::ParseAmbiguityLine(
206  int line_num, int version, int debug_level, const UNICHARSET &unicharset,
207  char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
208  int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
209  int i;
210  char *token;
211  char *next_token;
212  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
213  !sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
214  if (debug_level) tprintf(kIllegalMsg, line_num);
215  return false;
216  }
217  if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
218  tprintf("Too many unichars in ambiguity on line %d\n");
219  return false;
220  }
221  for (i = 0; i < *TestAmbigPartSize; ++i) {
222  if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
223  if (!unicharset.contains_unichar(token)) {
224  if (debug_level) tprintf(kIllegalUnicharMsg, token);
225  break;
226  }
227  TestUnicharIds[i] = unicharset.unichar_to_id(token);
228  }
229  TestUnicharIds[i] = INVALID_UNICHAR_ID;
230 
231  if (i != *TestAmbigPartSize ||
232  !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
233  !sscanf(token, "%d", ReplacementAmbigPartSize) ||
234  *ReplacementAmbigPartSize <= 0) {
235  if (debug_level) tprintf(kIllegalMsg, line_num);
236  return false;
237  }
238  if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
239  tprintf("Too many unichars in ambiguity on line %d\n");
240  return false;
241  }
242  ReplacementString[0] = '\0';
243  for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
244  if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
245  strcat(ReplacementString, token);
246  if (!unicharset.contains_unichar(token)) {
247  if (debug_level) tprintf(kIllegalUnicharMsg, token);
248  break;
249  }
250  }
251  if (i != *ReplacementAmbigPartSize) {
252  if (debug_level) tprintf(kIllegalMsg, line_num);
253  return false;
254  }
255  if (version > 0) {
256  // The next field being true indicates that the abiguity should
257  // always be substituted (e.g. '' should always be changed to ").
258  // For such "certain" n -> m ambigs tesseract will insert character
259  // fragments for the n pieces in the unicharset. AmbigsFound()
260  // will then replace the incorrect ngram with the character
261  // fragments of the correct character (or ngram if m > 1).
262  // Note that if m > 1, an ngram will be inserted into the
263  // modified word, not the individual unigrams. Tesseract
264  // has limited support for ngram unichar (e.g. dawg permuter).
265  if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
266  !sscanf(token, "%d", type)) {
267  if (debug_level) tprintf(kIllegalMsg, line_num);
268  return false;
269  }
270  }
271  return true;
272 }
273 
274 void UnicharAmbigs::InsertIntoTable(
275  UnicharAmbigsVector &table, int TestAmbigPartSize,
276  UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
277  const char *ReplacementString, int type,
278  AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
279  ambig_spec->type = static_cast<AmbigType>(type);
280  if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
281  unicharset->to_lower(TestUnicharIds[0]) ==
282  unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
283  ambig_spec->type = CASE_AMBIG;
284  }
285 
286  ambig_spec->wrong_ngram_size =
287  UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
288 
289  // Since we need to maintain a constant number of unichar positions in
290  // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
291  // each n->m ambiguity we will have to place n character fragments of the
292  // correct ngram into the corresponding positions in the vector (e.g. given
293  // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and
294  // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed
295  // from fragments by dawg_permute_and_select().
296 
297  // Insert the corresponding correct ngram into the unicharset.
298  // Unicharset code assumes that the "base" ngram is inserted into
299  // the unicharset before fragments of this ngram are inserted.
300  unicharset->unichar_insert(ReplacementString);
301  ambig_spec->correct_ngram_id =
302  unicharset->unichar_to_id(ReplacementString);
303  if (ReplacementAmbigPartSize > 1) {
304  unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
305  }
306  // Add the corresponding fragments of the wrong ngram to unicharset.
307  int i;
308  for (i = 0; i < TestAmbigPartSize; ++i) {
309  UNICHAR_ID unichar_id;
310  if (TestAmbigPartSize == 1) {
311  unichar_id = ambig_spec->correct_ngram_id;
312  } else {
313  STRING frag_str = CHAR_FRAGMENT::to_string(
314  ReplacementString, i, TestAmbigPartSize, false);
315  unicharset->unichar_insert(frag_str.string());
316  unichar_id = unicharset->unichar_to_id(frag_str.string());
317  }
318  ambig_spec->correct_fragments[i] = unichar_id;
319  }
320  ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;
321 
322  // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
323  // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
324  if (table[TestUnicharIds[0]] == NULL) {
325  table[TestUnicharIds[0]] = new AmbigSpec_LIST();
326  }
327  table[TestUnicharIds[0]]->add_sorted(
328  AmbigSpec::compare_ambig_specs, false, ambig_spec);
329 }
330 
331 } // namespace tesseract