Tesseract 3.02
heuristic.cpp
/* -*-C-*-
 ********************************************************************************
 *
 * File:         heuristic.cpp  (Formerly heuristic.c)
 * Description:
 * Author:       Mark Seaman, OCR Technology
 * Created:      Fri Oct 16 14:37:00 1987
 * Modified:     Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include <math.h>

// Note: "heuristic.h" is an empty file and deleted
#include "associate.h"
#include "bestfirst.h"
#include "seam.h"
#include "baseline.h"
#include "freelist.h"
#include "measure.h"
#include "ratngs.h"
#include "wordrec.h"

/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/
#define BAD_RATING   1000.0     /* No valid blob */

namespace tesseract {

/*----------------------------------------------------------------------
// Some static helpers used only in this file
----------------------------------------------------------------------*/

// Return a character width record corresponding to the character
// width that will be generated in this segmentation state.
// The calling function needs to memfree the WIDTH_RECORD when finished.
// This is the same as the original function with only cosmetic changes,
// except that instead of passing chunks back to be freed, it deallocates
// them internally.
WIDTH_RECORD *Wordrec::state_char_widths(WIDTH_RECORD *chunk_widths,
                                         STATE *state,
                                         int num_joints) {
  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
  int num_chars = chunks[0] + 1;

  // Allocate and store (n+1, w0, g0, w1, g1, ..., wn) in int[2*(n+1)] as a
  // struct { num_chars, widths[2*n+1]; }.
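  // For example, with num_chars == 3 the record holds
  //   { 3, w0, g0, w1, g1, w2 }
  // where wi is the width of character i and gi is the gap that follows it;
  // the last character has no trailing gap entry.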
  WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
  char_widths->num_chars = num_chars;

  int first_blob = 0;
  int last_blob;
  for (int i = 1; i <= num_chars; i++) {
    last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];

    char_widths->widths[2*i-2] =
        AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob);
    if (i <= chunks[0]) {
      char_widths->widths[2*i-1] =
          AssociateUtils::GetChunksGap(chunk_widths, last_blob);
    }

    if (segment_adjust_debug > 3)
      tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
              i-1, first_blob, last_blob, chunks[i],
              char_widths->widths[2*i-2], char_widths->widths[2*i-1],
              chunk_widths->widths[2*last_blob+1]);
    first_blob = last_blob + 1;
  }

  memfree(chunks);
  return char_widths;
}

// Computes the variance of the char widths normalized to the given height.
// TODO(dsl): Do this in a later stage and use char choice info to skip
// punctuation.
FLOAT32 Wordrec::get_width_variance(WIDTH_RECORD *wrec, float norm_height) {
  MEASUREMENT ws;
  new_measurement(ws);
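  // MEASUREMENT (from measure.h) keeps running sample statistics; each
  // ADD_SAMPLE below feeds in one normalized width so MEAN and VARIANCE
  // can be read out after the loop.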
  for (int x = 0; x < wrec->num_chars; x++) {
    FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height;
    if (x == wrec->num_chars - 1 && wh_ratio > 0.3)
      continue;   // exclude trailing punctuation from stats
    ADD_SAMPLE(ws, wh_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
  return VARIANCE(ws);
}

// Computes the variance of char positioning (width + spacing) wrt norm_height
FLOAT32 Wordrec::get_gap_variance(WIDTH_RECORD *wrec, float norm_height) {
  MEASUREMENT ws;
  new_measurement(ws);
  for (int x = 0; x < wrec->num_chars - 1; x++) {
    FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[2*x + 1])
                        * 1.0 / norm_height;
    ADD_SAMPLE(ws, gap_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
  return VARIANCE(ws);
}


/*----------------------------------------------------------------------
  Below are the new state prioritization functions, which incorporate
  segmentation cost and width distribution for fixed-pitch languages.
  They are included as methods in class Wordrec to obtain more context.
 ----------------------------------------------------------------------*/

/**********************************************************************
 * Returns the cost associated with the segmentation state.
 *
 * The number of states should be the same as the number of seams.
 * If state[i] is 1, then seams[i] is present; otherwise, it is hidden.
 * This function returns the sum of the priorities of the active seams.
 * TODO(dsl): To keep this clean, this function will just return the
 * sum of raw "priority" as seam cost.  The normalization of this score
 * relative to other costs will be done in bestfirst.cpp evaluate_seg().
 **********************************************************************/

FLOAT32 Wordrec::seamcut_priority(SEAMS seams,
                                  STATE *state,
                                  int num_joints) {
  int x;
  unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32))
                                        : (1 << (num_joints - 1));
  float seam_cost = 0.0f;
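  // Each joint x is stored as bit x of state->part2 when x < 32 and as bit
  // (x - 32) of state->part1 otherwise.  The loop below scans joints from
  // the highest down to 0; the mask shifts right with it and wraps from
  // bit 0 of part1 to bit 31 of part2 at the word boundary.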
  for (x = num_joints - 1; x >= 0; x--) {
    int i = num_joints - 1 - x;
    uinT32 value = (x < 32) ? state->part2 : state->part1;
    bool state_on = value & mask;
    if (state_on) {
      SEAM* seam = (SEAM *) array_value(seams, i);
      seam_cost += seam->priority;
    }
    if (mask == 1)
      mask = 1 << 31;
    else
      mask >>= 1;
  }
  if (segment_adjust_debug > 2)
    tprintf("seam_cost: %f\n", seam_cost);
  return seam_cost;
}

/**********************************************************************
 * rating_priority
 *
 * Assign a segmentation priority based on the ratings of the blobs
 * (in that segmentation) that have been classified.  The average
 * "goodness" (i.e. rating / weight) for each blob is used to indicate
 * the segmentation priority.  This is the original priority function.
 **********************************************************************/
FLOAT32 Wordrec::rating_priority(CHUNKS_RECORD *chunks_record,
                                 STATE *state,
                                 int num_joints) {
  BLOB_CHOICE_LIST *blob_choices;
  BLOB_CHOICE_IT blob_choice_it;
  inT16 first_chunk = 0;
  inT16 last_chunk;
  inT16 ratings = 0;
  inT16 weights = 0;

  PIECES_STATE blob_chunks;
  bin_to_pieces(state, num_joints, blob_chunks);

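  // blob_chunks[x] is the number of chunks that make up blob x in this
  // segmentation; the array is terminated by a zero entry.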
  for (int x = 0; blob_chunks[x]; x++) {
    last_chunk = first_chunk + blob_chunks[x];

    blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
    if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
      blob_choice_it.set_to_list(blob_choices);
      ratings += (inT16) blob_choice_it.data()->rating();
      for (int y = first_chunk; y < last_chunk; y++) {
        weights += (inT16) (chunks_record->weights[y]);
      }
    }
    first_chunk = last_chunk;
  }
  if (weights <= 0)
    weights = 1;
  FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
                        static_cast<FLOAT32>(weights);
  if (segment_adjust_debug > 2)
    tprintf("rating_cost: r%d / w%d = %f\n", ratings, weights, rating_cost);
  return rating_cost;
}

/**********************************************************************
 * width_priority
 *
 * Return a priority value for this word segmentation based on the
 * character widths present in the new segmentation.
 * For variable-pitch fonts, this should do the same thing as before:
 * i.e. penalize only really wide, squatting blobs.
 * For fixed-pitch fonts, this will include a measure of char & gap
 * width consistency.
 * TODO(dsl): generalize this to use a PDF estimate for proportional and
 * fixed pitch mode.
 **********************************************************************/
FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record,
                                STATE *state,
                                int num_joints) {
  FLOAT32 penalty = 0.0;
  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
                                              state, num_joints);
  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (assume_fixed_pitch_char_segment) {
    // For a fixed-pitch language like CJK, we use the full text height as
    // the normalizing factor so we are not dependent on xheight calculation.
    // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
    // so add the proportionally scaled ascender zone to get full text height.
    const DENORM& denorm = chunks_record->word_res->denorm;
    normalizing_height = denorm.y_scale() *
        (denorm.row()->x_height() + denorm.row()->ascenders());
    if (segment_adjust_debug > 1)
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              denorm.row()->x_height(), denorm.row()->ascenders(),
              normalizing_height);
    // Impose additional segmentation penalties if the blob width or gap
    // distribution doesn't fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  for (int x = 0; x < width_rec->num_chars; x++) {
    FLOAT32 squat = width_rec->widths[2*x];
    FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
    squat /= normalizing_height;
    gap /= normalizing_height;

    if (assume_fixed_pitch_char_segment) {
      penalty += AssociateUtils::FixedPitchWidthCost(
          squat, 0.0f, x == 0 || x == width_rec->num_chars - 1,
          heuristic_max_char_wh_ratio);
      penalty += AssociateUtils::FixedPitchGapCost(
          gap, x == width_rec->num_chars - 1);
      if (width_rec->num_chars == 1 &&
          squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) {
        penalty += 10;
      }
    } else {
      // Original equation when
      // heuristic_max_char_ratio == AssociateUtils::kMaxSquat
      if (squat > heuristic_max_char_wh_ratio)
        penalty += squat - heuristic_max_char_wh_ratio;
    }
  }

  free_widths(width_rec);
  return (penalty);
}


/**********************************************************************
 * prioritize_state
 *
 * Create a priority for this state.  It represents the urgency of
 * checking this state.  The larger the priority value, the worse the
 * state is (lower priority).  The "value" of this priority should be
 * somewhat consistent with the final word rating.
 * The question is how to normalize the different scores, and adjust
 * the relative importance among them.
 **********************************************************************/
FLOAT32 Wordrec::prioritize_state(CHUNKS_RECORD *chunks_record,
                                  SEARCH_RECORD *the_search) {
  FLOAT32 shape_cost;
  FLOAT32 width_cost;
  FLOAT32 seam_cost;

  shape_cost = rating_priority(chunks_record,
                               the_search->this_state,
                               the_search->num_joints);

  width_cost = width_priority(chunks_record,
                              the_search->this_state,
                              the_search->num_joints);

  // The rating_priority is the same as the original, and the width_priority
  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
  // So this would return the original state priority.
  if (!use_new_state_cost)
    return width_cost * 1000 + shape_cost;

  seam_cost = seamcut_priority(chunks_record->splits,
                               the_search->this_state,
                               the_search->num_joints);

  // TODO(dsl): how do we normalize the scores from these separate sources
  // of evidence?
  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
                       width_cost * heuristic_weight_width +
                       seam_cost * heuristic_weight_seamcut;

  // We don't have an adjustment model for folding variable-pitch
  // segmentation cost into the word rating.
  if (assume_fixed_pitch_char_segment) {
    float seg_bias = 1.0;
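    // A bias below 1 rewards segmentations whose widths look consistent;
    // width, seam and shape costs above their thresholds inflate the bias
    // multiplicatively (heuristic_segcost_rating_base is expected to be > 1).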
    if (width_cost < 1) seg_bias *= 0.85;
    if (width_cost > 3)
      seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
    if (seam_cost > 10)
      seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
    if (shape_cost > 5)
      seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
    if (segment_adjust_debug) {
      tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n",
              total_cost, seg_bias, shape_cost, width_cost, seam_cost);
    }
    the_search->segcost_bias = seg_bias;
  } else {
    the_search->segcost_bias = 0;
  }

  return total_cost;
}

}  // namespace tesseract