Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
werd.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: werd.cpp (Formerly word.c)
3  * Description: Code for the WERD class.
4  * Author: Ray Smith
5  * Created: Tue Oct 08 14:32:12 BST 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include "blckerr.h"
22 #include "helpers.h"
23 #include "linlsq.h"
24 #include "werd.h"
25 
26 // Include automatically generated configuration file if running autoconf.
27 #ifdef HAVE_CONFIG_H
28 #include "config_auto.h"
29 #endif
30 
31 #define FIRST_COLOUR ScrollView::RED ///< first rainbow colour; the colour NextColor wraps back to
32 #define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour; used as the wrap-around bound in NextColor
33 #define CHILD_COLOUR ScrollView::BROWN ///< colour of children; passed as the second colour to each blob's plot() in WERD::plot(window)
34 
// NOTE(review): the line that begins this statement is missing from this
// listing. The literal below is presumably the initialiser of an error-message
// constant; confirm against the original werd.cpp before relying on it.
36  "Attempted to scale an edgestep format word";
37 
39 
40 
49 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
50  : blanks(blank_count),
51  flags(0),
52  script_id_(0),
53  correct(text) {
54  C_BLOB_IT start_it = blob_list;
55  C_BLOB_IT end_it = blob_list;
56  C_BLOB_IT rej_cblob_it = &rej_cblobs;
57  C_OUTLINE_IT c_outline_it;
58  inT16 inverted_vote = 0;
59  inT16 non_inverted_vote = 0;
60 
61  // Move blob_list's elements into cblobs.
62  while (!end_it.at_last())
63  end_it.forward();
64  cblobs.assign_to_sublist(&start_it, &end_it);
65 
66  /*
67  Set white on black flag for the WERD, moving any duff blobs onto the
68  rej_cblobs list.
69  First, walk the cblobs checking the inverse flag for each outline of each
70  cblob. If a cblob has inconsistent flag settings for its different
71  outlines, move the blob to the reject list. Otherwise, increment the
72  appropriate w-on-b or b-on-w vote for the word.
73 
74  Now set the inversion flag for the WERD by maximum vote.
75 
76  Walk the blobs again, moving any blob whose inversion flag does not agree
77  with the concencus onto the reject list.
78  */
79  start_it.set_to_list(&cblobs);
80  if (start_it.empty())
81  return;
82  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
83  BOOL8 reject_blob = FALSE;
84  BOOL8 blob_inverted;
85 
86  c_outline_it.set_to_list(start_it.data()->out_list());
87  blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
88  for (c_outline_it.mark_cycle_pt();
89  !c_outline_it.cycled_list() && !reject_blob;
90  c_outline_it.forward()) {
91  reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
92  }
93  if (reject_blob) {
94  rej_cblob_it.add_after_then_move(start_it.extract());
95  } else {
96  if (blob_inverted)
97  inverted_vote++;
98  else
99  non_inverted_vote++;
100  }
101  }
102 
103  flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
104 
105  start_it.set_to_list(&cblobs);
106  if (start_it.empty())
107  return;
108  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
109  c_outline_it.set_to_list(start_it.data()->out_list());
110  if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
111  rej_cblob_it.add_after_then_move(start_it.extract());
112  }
113 }
114 
115 
123 WERD::WERD(C_BLOB_LIST * blob_list, //< In word order
124  WERD * clone) //< Source of flags
125  : flags(clone->flags),
126  script_id_(clone->script_id_),
127  correct(clone->correct) {
128  C_BLOB_IT start_it = blob_list; // iterator
129  C_BLOB_IT end_it = blob_list; // another
130 
131  while (!end_it.at_last ())
132  end_it.forward (); //move to last
133  ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
134  //move to our list
135  blanks = clone->blanks;
136  // fprintf(stderr,"Wrong constructor!!!!\n");
137 }
138 
139 // Construct a WERD from a single_blob and clone the flags from this.
140 // W_BOL and W_EOL flags are set according to the given values.
141 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
142  C_BLOB_LIST temp_blobs;
143  C_BLOB_IT temp_it(&temp_blobs);
144  temp_it.add_after_then_move(blob);
145  WERD* blob_word = new WERD(&temp_blobs, this);
146  blob_word->set_flag(W_BOL, bol);
147  blob_word->set_flag(W_EOL, eol);
148  return blob_word;
149 }
150 
// NOTE(review): the opening line of this definition (return type, name and
// opening brace) is missing from this listing. The body builds and returns a
// TBOX, and other code in this file calls bounding_box() on WERD objects, so
// this is presumably `TBOX WERD::bounding_box()`; confirm against the
// original werd.cpp.
//
// Accumulates, with +=, the bounding box of every blob on the rejected list
// and then of every blob on the main list, and returns the result.
165  TBOX box; // accumulator that each blob's box is added to
166  C_BLOB_IT rej_cblob_it = &rej_cblobs; // walks the rejected blobs
167 
168  for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
169  rej_cblob_it.forward()) {
170  box += rej_cblob_it.data()->bounding_box();
171  }
172 
173  C_BLOB_IT it = &cblobs; // walks the word's accepted blobs
174  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
175  box += it.data()->bounding_box();
176  }
177  return box;
178 }
179 
180 
188 void WERD::move(const ICOORD vec) {
189  C_BLOB_IT cblob_it(&cblobs); // cblob iterator
190 
191  for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
192  cblob_it.data()->move(vec);
193 }
194 
201 void WERD::join_on(WERD* other) {
202  C_BLOB_IT blob_it(&cblobs);
203  C_BLOB_IT src_it(&other->cblobs);
204  C_BLOB_IT rej_cblob_it(&rej_cblobs);
205  C_BLOB_IT src_rej_it(&other->rej_cblobs);
206 
207  while (!src_it.empty()) {
208  blob_it.add_to_end(src_it.extract());
209  src_it.forward();
210  }
211  while (!src_rej_it.empty()) {
212  rej_cblob_it.add_to_end(src_rej_it.extract());
213  src_rej_it.forward();
214  }
215 }
216 
217 
224 void WERD::copy_on(WERD* other) {
225  bool reversed = other->bounding_box().left() < bounding_box().left();
226  C_BLOB_IT c_blob_it(&cblobs);
227  C_BLOB_LIST c_blobs;
228 
229  c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
230  if (reversed) {
231  c_blob_it.add_list_before(&c_blobs);
232  } else {
233  c_blob_it.move_to_last();
234  c_blob_it.add_list_after(&c_blobs);
235  }
236  if (!other->rej_cblobs.empty()) {
237  C_BLOB_IT rej_c_blob_it(&rej_cblobs);
238  C_BLOB_LIST new_rej_c_blobs;
239 
240  new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
241  if (reversed) {
242  rej_c_blob_it.add_list_before(&new_rej_c_blobs);
243  } else {
244  rej_c_blob_it.move_to_last();
245  rej_c_blob_it.add_list_after(&new_rej_c_blobs);
246  }
247  }
248 }
249 
256 void WERD::print() {
257  tprintf("Blanks= %d\n", blanks);
258  bounding_box().print();
259  tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
260  tprintf(" W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
261  tprintf(" W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
262  tprintf(" W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
263  tprintf(" W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
264  tprintf(" W_NORMALIZED = %s\n",
265  flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
266  tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n",
267  flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
268  tprintf(" W_SCRIPT_IS_LATIN = %s\n",
269  flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
270  tprintf(" W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
271  tprintf(" W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
272  tprintf(" W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
273  tprintf(" W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
274  tprintf("Correct= %s\n", correct.string());
275  tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
276  tprintf("Script = %d\n", script_id_);
277 }
278 
279 
286 #ifndef GRAPHICS_DISABLED
287 void WERD::plot(ScrollView *window, ScrollView::Color colour) {
288  C_BLOB_IT it = &cblobs;
289  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
290  it.data()->plot(window, colour, colour);
291  }
292  plot_rej_blobs(window);
293 }
294 
295 // Get the next color in the (looping) rainbow.
// NOTE(review): the signature line of this definition is missing from this
// listing. The body reads a ScrollView::Color named `colour` and returns a
// ScrollView::Color, and WERD::plot calls it as NextColor(colour); confirm
// the exact declaration against the original werd.cpp.
//
// Steps to the enumerator that follows `colour`. If that value is
// LAST_COLOUR or above, or is below FIRST_COLOUR, the sequence restarts at
// FIRST_COLOUR. As written, LAST_COLOUR itself is therefore never returned;
// confirm that this is intended.
297  ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1);
298  if (next >= LAST_COLOUR || next < FIRST_COLOUR)
299  next = FIRST_COLOUR;
300  return next;
301 }
302 
309 void WERD::plot(ScrollView* window) {
311  C_BLOB_IT it = &cblobs;
312  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
313  it.data()->plot(window, colour, CHILD_COLOUR);
314  colour = NextColor(colour);
315  }
316  plot_rej_blobs(window);
317 }
318 
319 
// NOTE(review): the opening line of this definition is missing from this
// listing. The body uses a `window` argument and walks rej_cblobs, and both
// WERD::plot overloads call plot_rej_blobs(window), so this is presumably
// `void WERD::plot_rej_blobs(ScrollView *window)`; confirm against the
// original werd.cpp.
//
// Draws every blob on the rejected list, passing ScrollView::GREY for both
// colour arguments of the blob's plot().
328  C_BLOB_IT it = &rej_cblobs;
329  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
330  it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
331  }
332 }
333 #endif // GRAPHICS_DISABLED
334 
335 
// NOTE(review): the opening line of this definition is missing from this
// listing; confirm its name and signature against the original werd.cpp.
//
// Allocates a default-constructed WERD with new, copies blanks, flags, dummy
// and correct into it and returns the pointer. Neither blob list is copied.
// script_id_ is not assigned here, although operator= does copy it; confirm
// whether that omission is intentional.
343  WERD *new_word = new WERD;
344 
345  new_word->blanks = blanks;
346  new_word->flags = flags;
347  new_word->dummy = dummy;
348  new_word->correct = correct;
349  return new_word; // allocated with new above; presumably the caller must delete it
350 }
351 
352 
359 WERD & WERD::operator= (const WERD & source) {
360  this->ELIST2_LINK::operator= (source);
361  blanks = source.blanks;
362  flags = source.flags;
363  script_id_ = source.script_id_;
364  dummy = source.dummy;
365  correct = source.correct;
366  if (!cblobs.empty())
367  cblobs.clear();
368  cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
369 
370  if (!rej_cblobs.empty())
371  rej_cblobs.clear();
372  rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
373  return *this;
374 }
375 
376 
384 int word_comparator(const void *word1p, const void *word2p) {
385  WERD *word1 = *(WERD **)word1p;
386  WERD *word2 = *(WERD **)word2p;
387  return word1->bounding_box().left() - word2->bounding_box().left();
388 }
389 
402 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
403  C_BLOB_LIST* orphan_blobs) {
404  C_BLOB_LIST current_blob_list;
405  C_BLOB_IT werd_blobs_it(&current_blob_list);
406  // Add the word's c_blobs.
407  werd_blobs_it.add_list_after(cblob_list());
408 
409  // New blob list. These contain the blobs which will form the new word.
410  C_BLOB_LIST new_werd_blobs;
411  C_BLOB_IT new_blobs_it(&new_werd_blobs);
412 
413  // not_found_blobs contains the list of current word's blobs for which a
414  // corresponding blob wasn't found in the input all_blobs list.
415  C_BLOB_LIST not_found_blobs;
416  C_BLOB_IT not_found_it(&not_found_blobs);
417  not_found_it.move_to_last();
418 
419  werd_blobs_it.move_to_first();
420  for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list();
421  werd_blobs_it.forward()) {
422  C_BLOB* werd_blob = werd_blobs_it.extract();
423  TBOX werd_blob_box = werd_blob->bounding_box();
424  bool found = false;
425  // Now find the corresponding blob for this blob in the all_blobs
426  // list. For now, follow the inefficient method of pairwise
427  // comparisons. Ideally, one can pre-bucket the blobs by row.
428  C_BLOB_IT all_blobs_it(all_blobs);
429  for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
430  all_blobs_it.forward()) {
431  C_BLOB* a_blob = all_blobs_it.data();
432  // Compute the overlap of the two blobs. If major, a_blob should
433  // be added to the new blobs list.
434  TBOX a_blob_box = a_blob->bounding_box();
435  if (a_blob_box.null_box()) {
436  tprintf("Bounding box couldn't be ascertained\n");
437  }
438  if (werd_blob_box.contains(a_blob_box) ||
439  werd_blob_box.major_overlap(a_blob_box)) {
440  // Old blobs are from minimal splits, therefore are expected to be
441  // bigger. The new small blobs should cover a significant portion.
442  // This is it.
443  all_blobs_it.extract();
444  new_blobs_it.add_after_then_move(a_blob);
445  found = true;
446  }
447  }
448  if (!found) {
449  not_found_it.add_after_then_move(werd_blob);
450  } else {
451  delete werd_blob;
452  }
453  }
454  // Iterate over all not found blobs. Some of them may be due to
455  // under-segmentation (which is OK, since the corresponding blob is already
456  // in the list in that case.
457  not_found_it.move_to_first();
458  for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
459  not_found_it.forward()) {
460  C_BLOB* not_found = not_found_it.data();
461  TBOX not_found_box = not_found->bounding_box();
462  C_BLOB_IT existing_blobs_it(new_blobs_it);
463  for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
464  existing_blobs_it.forward()) {
465  C_BLOB* a_blob = existing_blobs_it.data();
466  TBOX a_blob_box = a_blob->bounding_box();
467  if ((not_found_box.major_overlap(a_blob_box) ||
468  a_blob_box.major_overlap(not_found_box)) &&
469  not_found_box.y_overlap(a_blob_box)) {
470  // Already taken care of.
471  delete not_found_it.extract();
472  break;
473  }
474  }
475  }
476  if (orphan_blobs) {
477  C_BLOB_IT orphan_blobs_it(orphan_blobs);
478  orphan_blobs_it.move_to_last();
479  orphan_blobs_it.add_list_after(&not_found_blobs);
480  }
481 
482  // New blobs are ready. Create a new werd object with these.
483  WERD* new_werd = NULL;
484  if (!new_werd_blobs.empty()) {
485  new_werd = new WERD(&new_werd_blobs, this);
486  } else {
487  // Add the blobs back to this word so that it can be reused.
488  C_BLOB_IT this_list_it(cblob_list());
489  this_list_it.add_list_after(&not_found_blobs);
490  }
491  return new_werd;
492 }