Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
oldbasel.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: oldbasel.cpp (Formerly oldbl.c)
3  * Description: A re-implementation of the old baseline algorithm.
4  * Author: Ray Smith
5  * Created: Wed Oct 6 09:41:48 BST 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include "ccstruct.h"
22 #include "statistc.h"
23 #include "quadlsq.h"
24 #include "detlinefit.h"
25 #include "makerow.h"
26 #include "drawtord.h"
27 #include "oldbasel.h"
28 #include "textord.h"
29 #include "tprintf.h"
30 
31 // Include automatically generated configuration file if running autoconf.
32 #ifdef HAVE_CONFIG_H
33 #include "config_auto.h"
34 #endif
35 
36 #define EXTERN
37 
39 "Use original wiseowl xheight");
40 EXTERN BOOL_VAR (textord_oldbl_debug, FALSE, "Debug old baseline generation");
41 EXTERN BOOL_VAR (textord_debug_baselines, FALSE, "Debug baseline generation");
42 EXTERN BOOL_VAR (textord_oldbl_paradef, TRUE, "Use para default mechanism");
43 EXTERN BOOL_VAR (textord_oldbl_split_splines, TRUE, "Split stepped splines");
44 EXTERN BOOL_VAR (textord_oldbl_merge_parts, TRUE, "Merge suspect partitions");
45 EXTERN BOOL_VAR (oldbl_corrfix, TRUE, "Improve correlation of heights");
47 "Fix bug in modes threshold for xheights");
48 EXTERN BOOL_VAR(textord_ocropus_mode, FALSE, "Make baselines for ocropus");
49 EXTERN double_VAR (oldbl_xhfract, 0.4, "Fraction of est allowed in calc");
51 "Max lost before fallback line used");
52 EXTERN double_VAR (oldbl_dot_error_size, 1.26, "Max aspect ratio of a dot");
54 "X fraction for new partition");
55 
56 #define TURNLIMIT 1 /*min size for turning point */
57 #define X_HEIGHT_FRACTION 0.7 /*x-height/caps height */
58 #define DESCENDER_FRACTION 0.5 /*descender/x-height */
59 #define MIN_ASC_FRACTION 0.20 /*min size of ascenders */
60 #define MIN_DESC_FRACTION 0.25 /*min size of descenders */
61 #define MINASCRISE 2.0 /*min ascender/desc step */
62 #define MAXHEIGHTVARIANCE 0.15 /*accepted variation in x-height */
63 #define MAXHEIGHT 300 /*max blob height */
64 #define MAXOVERLAP 0.1 /*max 10% missed overlap */
65 #define MAXBADRUN 2 /*max non best for failed */
66 #define HEIGHTBUCKETS 200 /* Num of buckets */
67 #define DELTAHEIGHT 5.0 /* Small amount of diff */
68 #define GOODHEIGHT 5
69 #define MAXLOOPS 10
70 #define MODENUM 10
71 #define MAXPARTS 6
72 #define SPLINESIZE 23
73 
74 #define ABS(x) ((x)<0 ? (-(x)) : (x))
75 
76 namespace tesseract {
77 
78 /**********************************************************************
79  * make_old_baselines
80  *
81  * Top level function to make baselines the old way.
82  **********************************************************************/
83 
84 void Textord::make_old_baselines(TO_BLOCK *block, // block to do
85  BOOL8 testing_on, // correct orientation
86  float gradient) {
87  QSPLINE *prev_baseline; // baseline of previous row
88  TO_ROW *row; // current row
89  TO_ROW_IT row_it = block->get_rows();
90  BLOBNBOX_IT blob_it;
91 
92  prev_baseline = NULL; // nothing yet
93  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
94  row = row_it.data();
95  find_textlines(block, row, 2, NULL);
96  if (row->xheight <= 0 && prev_baseline != NULL)
97  find_textlines(block, row, 2, prev_baseline);
98  if (row->xheight > 0) { // was a good one
99  prev_baseline = &row->baseline;
100  } else {
101  prev_baseline = NULL;
102  blob_it.set_to_list(row->blob_list());
104  tprintf("Row baseline generation failed on row at (%d,%d)\n",
105  blob_it.data()->bounding_box().left(),
106  blob_it.data()->bounding_box().bottom());
107  }
108  }
109  correlate_lines(block, gradient);
110  block->block->set_xheight(block->xheight);
111 }
112 
113 
114 /**********************************************************************
115  * correlate_lines
116  *
117  * Correlate the x-heights and ascender heights of a block to fill-in
118  * the ascender height and descender height for rows without one.
119  * Also fix baselines of rows without a decent fit.
120  **********************************************************************/
121 
122 void Textord::correlate_lines(TO_BLOCK *block, float gradient) {
123  TO_ROW **rows; //array of ptrs
124  int rowcount; /*no of rows to do */
125  register int rowindex; /*no of row */
126  //iterator
127  TO_ROW_IT row_it = block->get_rows ();
128 
129  rowcount = row_it.length ();
130  if (rowcount == 0) {
131  //default value
132  block->xheight = block->line_size;
133  return; /*none to do */
134  }
135  rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
136  rowindex = 0;
137  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
138  //make array
139  rows[rowindex++] = row_it.data ();
140 
141  /*try to fix bad lines */
142  correlate_neighbours(block, rows, rowcount);
143 
145  block->xheight = (float) correlate_with_stats(rows, rowcount, block);
146  if (block->xheight <= 0)
148  if (block->xheight < textord_min_xheight)
149  block->xheight = (float) textord_min_xheight;
150  } else {
151  compute_block_xheight(block, gradient);
152  }
153 
154  free_mem(rows);
155 }
156 
157 
158 /**********************************************************************
159  * correlate_neighbours
160  *
161  * Try to fix rows that had a bad spline fit by using neighbours.
162  **********************************************************************/
163 
164 void Textord::correlate_neighbours(TO_BLOCK *block, // block rows are in.
165  TO_ROW **rows, // rows of block.
166  int rowcount) { // no of rows to do.
167  TO_ROW *row; /*current row */
168  register int rowindex; /*no of row */
169  register int otherrow; /*second row */
170  int upperrow; /*row above to use */
171  int lowerrow; /*row below to use */
172  float biggest;
173 
174  for (rowindex = 0; rowindex < rowcount; rowindex++) {
175  row = rows[rowindex]; /*current row */
176  if (row->xheight < 0) {
177  /*quadratic failed */
178  for (otherrow = rowindex - 2;
179  otherrow >= 0
180  && (rows[otherrow]->xheight < 0.0
181  || !row->baseline.overlap (&rows[otherrow]->baseline,
182  MAXOVERLAP)); otherrow--);
183  upperrow = otherrow; /*decent row above */
184  for (otherrow = rowindex + 1;
185  otherrow < rowcount
186  && (rows[otherrow]->xheight < 0.0
187  || !row->baseline.overlap (&rows[otherrow]->baseline,
188  MAXOVERLAP)); otherrow++);
189  lowerrow = otherrow; /*decent row below */
190  if (upperrow >= 0)
191  find_textlines(block, row, 2, &rows[upperrow]->baseline);
192  if (row->xheight < 0 && lowerrow < rowcount)
193  find_textlines(block, row, 2, &rows[lowerrow]->baseline);
194  if (row->xheight < 0) {
195  if (upperrow >= 0)
196  find_textlines(block, row, 1, &rows[upperrow]->baseline);
197  else if (lowerrow < rowcount)
198  find_textlines(block, row, 1, &rows[lowerrow]->baseline);
199  }
200  }
201  }
202 
203  for (biggest = 0.0f, rowindex = 0; rowindex < rowcount; rowindex++) {
204  row = rows[rowindex]; /*current row */
205  if (row->xheight < 0) /*linear failed */
206  /*make do */
207  row->xheight = -row->xheight;
208  biggest = MAX (biggest, row->xheight);
209  }
210 }
211 
212 
213 /**********************************************************************
214  * correlate_with_stats
215  *
216  * correlate the x-heights and ascender heights of a block to fill-in
217  * the ascender height and descender height for rows without one.
218  **********************************************************************/
219 
220 int Textord::correlate_with_stats(TO_ROW **rows, // rows of block.
221  int rowcount, // no of rows to do.
222  TO_BLOCK* block) {
223  TO_ROW *row; /*current row */
224  register int rowindex; /*no of row */
225  float lineheight; /*mean x-height */
226  float ascheight; /*average ascenders */
227  float minascheight; /*min allowed ascheight */
228  int xcount; /*no of samples for xheight */
229  float fullheight; /*mean top height */
230  int fullcount; /*no of samples */
231  float descheight; /*mean descender drop */
232  float mindescheight; /*min allowed descheight */
233  int desccount; /*no of samples */
234  float xshift; /*shift in xheight */
235 
236  /*no samples */
237  xcount = fullcount = desccount = 0;
238  lineheight = ascheight = fullheight = descheight = 0.0;
239  for (rowindex = 0; rowindex < rowcount; rowindex++) {
240  row = rows[rowindex]; /*current row */
241  if (row->ascrise > 0.0) { /*got ascenders? */
242  lineheight += row->xheight;/*average x-heights */
243  ascheight += row->ascrise; /*average ascenders */
244  xcount++;
245  }
246  else {
247  fullheight += row->xheight;/*assume full height */
248  fullcount++;
249  }
250  if (row->descdrop < 0.0) { /*got descenders? */
251  /*average descenders */
252  descheight += row->descdrop;
253  desccount++;
254  }
255  }
256 
257  if (xcount > 0 && (!oldbl_corrfix || xcount >= fullcount)) {
258  lineheight /= xcount; /*average x-height */
259  /*average caps height */
260  fullheight = lineheight + ascheight / xcount;
261  /*must be decent size */
262  if (fullheight < lineheight * (1 + MIN_ASC_FRACTION))
263  fullheight = lineheight * (1 + MIN_ASC_FRACTION);
264  }
265  else {
266  fullheight /= fullcount; /*average max height */
267  /*guess x-height */
268  lineheight = fullheight * X_HEIGHT_FRACTION;
269  }
270  if (desccount > 0 && (!oldbl_corrfix || desccount >= rowcount / 2))
271  descheight /= desccount; /*average descenders */
272  else
273  /*guess descenders */
274  descheight = -lineheight * DESCENDER_FRACTION;
275 
276  if (lineheight > 0.0f)
277  block->block->set_cell_over_xheight((fullheight - descheight) / lineheight);
278 
279  minascheight = lineheight * MIN_ASC_FRACTION;
280  mindescheight = -lineheight * MIN_DESC_FRACTION;
281  for (rowindex = 0; rowindex < rowcount; rowindex++) {
282  row = rows[rowindex]; /*do each row */
283  row->all_caps = FALSE;
284  if (row->ascrise / row->xheight < MIN_ASC_FRACTION) {
285  /*no ascenders */
286  if (row->xheight >= lineheight * (1 - MAXHEIGHTVARIANCE)
287  && row->xheight <= lineheight * (1 + MAXHEIGHTVARIANCE)) {
288  row->ascrise = fullheight - lineheight;
289  /*shift in x */
290  xshift = lineheight - row->xheight;
291  /*set to average */
292  row->xheight = lineheight;
293 
294  }
295  else if (row->xheight >= fullheight * (1 - MAXHEIGHTVARIANCE)
296  && row->xheight <= fullheight * (1 + MAXHEIGHTVARIANCE)) {
297  row->ascrise = row->xheight - lineheight;
298  xshift = -row->ascrise; /*shift in x */
299  /*set to average */
300  row->xheight = lineheight;
301  row->all_caps = TRUE;
302  }
303  else {
304  row->ascrise = (fullheight - lineheight) * row->xheight
305  / fullheight;
306  xshift = -row->ascrise; /*shift in x */
307  /*scale it */
308  row->xheight -= row->ascrise;
309  row->all_caps = TRUE;
310  }
311  if (row->ascrise < minascheight)
312  row->ascrise =
313  row->xheight * ((1.0 - X_HEIGHT_FRACTION) / X_HEIGHT_FRACTION);
314  }
315  if (row->descdrop > mindescheight) {
316  if (row->xheight >= lineheight * (1 - MAXHEIGHTVARIANCE)
317  && row->xheight <= lineheight * (1 + MAXHEIGHTVARIANCE))
318  /*set to average */
319  row->descdrop = descheight;
320  else
321  row->descdrop = -row->xheight * DESCENDER_FRACTION;
322  }
323  }
324  return (int) lineheight; //block xheight
325 }
326 
327 
328 /**********************************************************************
329  * find_textlines
330  *
331  * Compute the baseline for the given row.
332  **********************************************************************/
333 
334 void Textord::find_textlines(TO_BLOCK *block, // block row is in
335  TO_ROW *row, // row to do
336  int degree, // required approximation
337  QSPLINE *spline) { // starting spline
338  int partcount; /*no of partitions of */
339  BOOL8 holed_line = FALSE; //lost too many blobs
340  int bestpart; /*biggest partition */
341  char *partids; /*partition no of each blob */
342  int partsizes[MAXPARTS]; /*no in each partition */
343  int lineheight; /*guessed x-height */
344  float jumplimit; /*allowed delta change */
345  int *xcoords; /*useful sample points */
346  int *ycoords; /*useful sample points */
347  TBOX *blobcoords; /*edges of blob rectangles */
348  int blobcount; /*no of blobs on line */
349  float *ydiffs; /*diffs from 1st approx */
350  int pointcount; /*no of coords */
351  int xstarts[SPLINESIZE + 1]; //segment boundaries
352  int segments; //no of segments
353 
354  //no of blobs in row
355  blobcount = row->blob_list ()->length ();
356  partids = (char *) alloc_mem (blobcount * sizeof (char));
357  xcoords = (int *) alloc_mem (blobcount * sizeof (int));
358  ycoords = (int *) alloc_mem (blobcount * sizeof (int));
359  blobcoords = (TBOX *) alloc_mem (blobcount * sizeof (TBOX));
360  ydiffs = (float *) alloc_mem (blobcount * sizeof (float));
361 
362  lineheight = get_blob_coords (row, (int) block->line_size, blobcoords,
363  holed_line, blobcount);
364  /*limit for line change */
365  jumplimit = lineheight * textord_oldbl_jumplimit;
366  if (jumplimit < MINASCRISE)
367  jumplimit = MINASCRISE;
368 
369  if (textord_oldbl_debug) {
370  tprintf
371  ("\nInput height=%g, Estimate x-height=%d pixels, jumplimit=%.2f\n",
372  block->line_size, lineheight, jumplimit);
373  }
374  if (holed_line)
375  make_holed_baseline (blobcoords, blobcount, spline, &row->baseline,
376  row->line_m ());
377  else
378  make_first_baseline (blobcoords, blobcount,
379  xcoords, ycoords, spline, &row->baseline, jumplimit);
380 #ifndef GRAPHICS_DISABLED
383 #endif
384  if (blobcount > 1) {
385  bestpart = partition_line (blobcoords, blobcount,
386  &partcount, partids, partsizes,
387  &row->baseline, jumplimit, ydiffs);
388  pointcount = partition_coords (blobcoords, blobcount,
389  partids, bestpart, xcoords, ycoords);
390  segments = segment_spline (blobcoords, blobcount,
391  xcoords, ycoords,
392  degree, pointcount, xstarts);
393  if (!holed_line) {
394  do {
395  row->baseline = QSPLINE (xstarts, segments,
396  xcoords, ycoords, pointcount, degree);
397  }
399  && split_stepped_spline (&row->baseline, jumplimit / 2,
400  xcoords, xstarts, segments));
401  }
402  find_lesser_parts(row,
403  blobcoords,
404  blobcount,
405  partids,
406  partsizes,
407  partcount,
408  bestpart);
409 
410  }
411  else {
412  row->xheight = -1.0f; /*failed */
413  row->descdrop = 0.0f;
414  row->ascrise = 0.0f;
415  }
416  row->baseline.extrapolate (row->line_m (),
417  block->block->bounding_box ().left (),
418  block->block->bounding_box ().right ());
419 
421  old_first_xheight (row, blobcoords, lineheight,
422  blobcount, &row->baseline, jumplimit);
423  } else if (textord_old_xheight) {
424  make_first_xheight (row, blobcoords, lineheight, (int) block->line_size,
425  blobcount, &row->baseline, jumplimit);
426  } else {
427  compute_row_xheight(row, block->block->classify_rotation(),
428  row->line_m(), block->line_size);
429  }
430  free_mem(partids);
431  free_mem(xcoords);
432  free_mem(ycoords);
433  free_mem(blobcoords);
434  free_mem(ydiffs);
435 }
436 
437 } // namespace tesseract.
438 
439 
440 /**********************************************************************
441  * get_blob_coords
442  *
443  * Fill the blobcoords array with the coordinates of the blobs
444  * in the row. The return value is the first guess at the line height.
445  **********************************************************************/
446 
447 int get_blob_coords( //get boxes
448  TO_ROW *row, //row to use
449  inT32 lineheight, //block level
450  TBOX *blobcoords, //ouput boxes
451  BOOL8 &holed_line, //lost a lot of blobs
452  int &outcount //no of real blobs
453  ) {
454  //blobs
455  BLOBNBOX_IT blob_it = row->blob_list ();
456  register int blobindex; /*no along text line */
457  int losscount; //lost blobs
458  int maxlosscount; //greatest lost blobs
459  /*height stat collection */
460  STATS heightstat (0, MAXHEIGHT);
461 
462  if (blob_it.empty ())
463  return 0; //none
464  maxlosscount = 0;
465  losscount = 0;
466  blob_it.mark_cycle_pt ();
467  blobindex = 0;
468  do {
469  blobcoords[blobindex] = box_next_pre_chopped (&blob_it);
470  if (blobcoords[blobindex].height () > lineheight * 0.25)
471  heightstat.add (blobcoords[blobindex].height (), 1);
472  if (blobindex == 0
473  || blobcoords[blobindex].height () > lineheight * 0.25
474  || blob_it.cycled_list ()) {
475  blobindex++; /*no of merged blobs */
476  losscount = 0;
477  }
478  else {
479  if (blobcoords[blobindex].height ()
480  < blobcoords[blobindex].width () * oldbl_dot_error_size
481  && blobcoords[blobindex].width ()
482  < blobcoords[blobindex].height () * oldbl_dot_error_size) {
483  //counts as dot
484  blobindex++;
485  losscount = 0;
486  }
487  else {
488  losscount++; //lost it
489  if (losscount > maxlosscount)
490  //remember max
491  maxlosscount = losscount;
492  }
493  }
494  }
495  while (!blob_it.cycled_list ());
496 
497  holed_line = maxlosscount > oldbl_holed_losscount;
498  outcount = blobindex; /*total blobs */
499 
500  if (heightstat.get_total () > 1)
501  /*guess x-height */
502  return (int) heightstat.ile (0.25);
503  else
504  return blobcoords[0].height ();
505 }
506 
507 
508 /**********************************************************************
509  * make_first_baseline
510  *
511  * Make the first estimate at a baseline, either by shifting
512  * a supplied previous spline, or by doing a piecewise linear
513  * approximation using all the blobs.
514  **********************************************************************/
515 
516 void
517 make_first_baseline ( //initial approximation
518 TBOX blobcoords[], /*blob bounding boxes */
519 int blobcount, /*no of blobcoords */
520 int xcoords[], /*coords for spline */
521 int ycoords[], /*approximator */
522 QSPLINE * spline, /*initial spline */
523 QSPLINE * baseline, /*output spline */
524 float jumplimit /*guess half descenders */
525 ) {
526  int leftedge; /*left edge of line */
527  int rightedge; /*right edge of line */
528  int blobindex; /*current blob */
529  int segment; /*current segment */
530  float prevy, thisy, nexty; /*3 y coords */
531  float y1, y2, y3; /*3 smooth blobs */
532  float maxmax, minmin; /*absolute limits */
533  int x2 = 0; /*right edge of old y3 */
534  int ycount; /*no of ycoords in use */
535  float yturns[SPLINESIZE]; /*y coords of turn pts */
536  int xturns[SPLINESIZE]; /*xcoords of turn pts */
537  int xstarts[SPLINESIZE + 1];
538  int segments; //no of segments
539  ICOORD shift; //shift of spline
540 
541  prevy = 0;
542  /*left edge of row */
543  leftedge = blobcoords[0].left ();
544  /*right edge of line */
545  rightedge = blobcoords[blobcount - 1].right ();
546  if (spline == NULL /*no given spline */
547  || spline->segments < 3 /*or trivial */
548  /*or too non-overlap */
549  || spline->xcoords[1] > leftedge + MAXOVERLAP * (rightedge - leftedge)
550  || spline->xcoords[spline->segments - 1] < rightedge
551  - MAXOVERLAP * (rightedge - leftedge)) {
553  return; //use default
554  xstarts[0] = blobcoords[0].left () - 1;
555  for (blobindex = 0; blobindex < blobcount; blobindex++) {
556  xcoords[blobindex] = (blobcoords[blobindex].left ()
557  + blobcoords[blobindex].right ()) / 2;
558  ycoords[blobindex] = blobcoords[blobindex].bottom ();
559  }
560  xstarts[1] = blobcoords[blobcount - 1].right () + 1;
561  segments = 1; /*no of segments */
562 
563  /*linear */
564  *baseline = QSPLINE (xstarts, segments, xcoords, ycoords, blobcount, 1);
565 
566  if (blobcount >= 3) {
567  y1 = y2 = y3 = 0.0f;
568  ycount = 0;
569  segment = 0; /*no of segments */
570  maxmax = minmin = 0.0f;
571  thisy = ycoords[0] - baseline->y (xcoords[0]);
572  nexty = ycoords[1] - baseline->y (xcoords[1]);
573  for (blobindex = 2; blobindex < blobcount; blobindex++) {
574  prevy = thisy; /*shift ycoords */
575  thisy = nexty;
576  nexty = ycoords[blobindex] - baseline->y (xcoords[blobindex]);
577  /*middle of smooth y */
578  if (ABS (thisy - prevy) < jumplimit && ABS (thisy - nexty) < jumplimit) {
579  y1 = y2; /*shift window */
580  y2 = y3;
581  y3 = thisy; /*middle point */
582  ycount++;
583  /*local max */
584  if (ycount >= 3 && ((y1 < y2 && y2 >= y3)
585  /*local min */
586  || (y1 > y2 && y2 <= y3))) {
587  if (segment < SPLINESIZE - 2) {
588  /*turning pt */
589  xturns[segment] = x2;
590  yturns[segment] = y2;
591  segment++; /*no of spline segs */
592  }
593  }
594  if (ycount == 1) {
595  maxmax = minmin = y3;/*initialise limits */
596  }
597  else {
598  if (y3 > maxmax)
599  maxmax = y3; /*biggest max */
600  if (y3 < minmin)
601  minmin = y3; /*smallest min */
602  }
603  /*possible turning pt */
604  x2 = blobcoords[blobindex - 1].right ();
605  }
606  }
607 
608  jumplimit *= 1.2;
609  /*must be wavy */
610  if (maxmax - minmin > jumplimit) {
611  ycount = segment; /*no of segments */
612  for (blobindex = 0, segment = 1; blobindex < ycount;
613  blobindex++) {
614  if (yturns[blobindex] > minmin + jumplimit
615  || yturns[blobindex] < maxmax - jumplimit) {
616  /*significant peak */
617  if (segment == 1
618  || yturns[blobindex] > prevy + jumplimit
619  || yturns[blobindex] < prevy - jumplimit) {
620  /*different to previous */
621  xstarts[segment] = xturns[blobindex];
622  segment++;
623  prevy = yturns[blobindex];
624  }
625  /*bigger max */
626  else if ((prevy > minmin + jumplimit && yturns[blobindex] > prevy)
627  /*smaller min */
628  || (prevy < maxmax - jumplimit && yturns[blobindex] < prevy)) {
629  xstarts[segment - 1] = xturns[blobindex];
630  /*improved previous */
631  prevy = yturns[blobindex];
632  }
633  }
634  }
635  xstarts[segment] = blobcoords[blobcount - 1].right () + 1;
636  segments = segment; /*no of segments */
637  /*linear */
638  *baseline = QSPLINE (xstarts, segments, xcoords, ycoords, blobcount, 1);
639  }
640  }
641  }
642  else {
643  *baseline = *spline; /*copy it */
644  shift = ICOORD (0, (inT16) (blobcoords[0].bottom ()
645  - spline->y (blobcoords[0].right ())));
646  baseline->move (shift);
647  }
648 }
649 
650 
651 /**********************************************************************
652  * make_holed_baseline
653  *
654  * Make the first estimate at a baseline, either by shifting
655  * a supplied previous spline, or by doing a piecewise linear
656  * approximation using all the blobs.
657  **********************************************************************/
658 
659 void
660 make_holed_baseline ( //initial approximation
661 TBOX blobcoords[], /*blob bounding boxes */
662 int blobcount, /*no of blobcoords */
663 QSPLINE * spline, /*initial spline */
664 QSPLINE * baseline, /*output spline */
665 float gradient //of line
666 ) {
667  int leftedge; /*left edge of line */
668  int rightedge; /*right edge of line */
669  int blobindex; /*current blob */
670  float x; //centre of row
671  ICOORD shift; //shift of spline
672 
673  tesseract::DetLineFit lms; // straight baseline
674  inT32 xstarts[2]; //straight line
675  double coeffs[3];
676  float c; //line parameter
677 
678  /*left edge of row */
679  leftedge = blobcoords[0].left ();
680  /*right edge of line */
681  rightedge = blobcoords[blobcount - 1].right();
682  for (blobindex = 0; blobindex < blobcount; blobindex++) {
683  lms.Add(ICOORD((blobcoords[blobindex].left() +
684  blobcoords[blobindex].right()) / 2,
685  blobcoords[blobindex].bottom()));
686  }
687  lms.ConstrainedFit(gradient, &c);
688  xstarts[0] = leftedge;
689  xstarts[1] = rightedge;
690  coeffs[0] = 0;
691  coeffs[1] = gradient;
692  coeffs[2] = c;
693  *baseline = QSPLINE (1, xstarts, coeffs);
694  if (spline != NULL /*no given spline */
695  && spline->segments >= 3 /*or trivial */
696  /*or too non-overlap */
697  && spline->xcoords[1] <= leftedge + MAXOVERLAP * (rightedge - leftedge)
698  && spline->xcoords[spline->segments - 1] >= rightedge
699  - MAXOVERLAP * (rightedge - leftedge)) {
700  *baseline = *spline; /*copy it */
701  x = (leftedge + rightedge) / 2.0;
702  shift = ICOORD (0, (inT16) (gradient * x + c - spline->y (x)));
703  baseline->move (shift);
704  }
705 }
706 
707 
708 /**********************************************************************
709  * partition_line
710  *
711  * Partition a row of blobs into different groups of continuous
712  * y position. jumplimit specifies the max allowable limit on a jump
713  * before a new partition is started.
714  * The return value is the biggest partition
715  **********************************************************************/
716 
717 int
718 partition_line ( //partition blobs
719 TBOX blobcoords[], //bounding boxes
720 int blobcount, /*no of blobs on row */
721 int *numparts, /*number of partitions */
722 char partids[], /*partition no of each blob */
723 int partsizes[], /*no in each partition */
724 QSPLINE * spline, /*curve to fit to */
725 float jumplimit, /*allowed delta change */
726 float ydiffs[] /*diff from spline */
727 ) {
728  register int blobindex; /*no along text line */
729  int bestpart; /*best new partition */
730  int biggestpart; /*part with most members */
731  float diff; /*difference from line */
732  int startx; /*index of start blob */
733  float partdiffs[MAXPARTS]; /*step between parts */
734 
735  for (bestpart = 0; bestpart < MAXPARTS; bestpart++)
736  partsizes[bestpart] = 0; /*zero them all */
737 
738  startx = get_ydiffs (blobcoords, blobcount, spline, ydiffs);
739  *numparts = 1; /*1 partition */
740  bestpart = -1; /*first point */
741  float drift = 0.0f;
742  float last_delta = 0.0f;
743  for (blobindex = startx; blobindex < blobcount; blobindex++) {
744  /*do each blob in row */
745  diff = ydiffs[blobindex]; /*diff from line */
746  if (textord_oldbl_debug) {
747  tprintf ("%d(%d,%d), ", blobindex,
748  blobcoords[blobindex].left (),
749  blobcoords[blobindex].bottom ());
750  }
751  bestpart = choose_partition(diff, partdiffs, bestpart, jumplimit,
752  &drift, &last_delta, numparts);
753  /*record partition */
754  partids[blobindex] = bestpart;
755  partsizes[bestpart]++; /*another in it */
756  }
757 
758  bestpart = -1; /*first point */
759  drift = 0.0f;
760  last_delta = 0.0f;
761  partsizes[0]--; /*doing 1st pt again */
762  /*do each blob in row */
763  for (blobindex = startx; blobindex >= 0; blobindex--) {
764  diff = ydiffs[blobindex]; /*diff from line */
765  if (textord_oldbl_debug) {
766  tprintf ("%d(%d,%d), ", blobindex,
767  blobcoords[blobindex].left (),
768  blobcoords[blobindex].bottom ());
769  }
770  bestpart = choose_partition(diff, partdiffs, bestpart, jumplimit,
771  &drift, &last_delta, numparts);
772  /*record partition */
773  partids[blobindex] = bestpart;
774  partsizes[bestpart]++; /*another in it */
775  }
776 
777  for (biggestpart = 0, bestpart = 1; bestpart < *numparts; bestpart++)
778  if (partsizes[bestpart] >= partsizes[biggestpart])
779  biggestpart = bestpart; /*new biggest */
781  merge_oldbl_parts(blobcoords,
782  blobcount,
783  partids,
784  partsizes,
785  biggestpart,
786  jumplimit);
787  return biggestpart; /*biggest partition */
788 }
789 
790 
791 /**********************************************************************
792  * merge_oldbl_parts
793  *
794  * For any adjacent group of blobs in a different part, put them in the
795  * main part if they fit closely to neighbours in the main part.
796  **********************************************************************/
797 
798 void
799 merge_oldbl_parts ( //partition blobs
800 TBOX blobcoords[], //bounding boxes
801 int blobcount, /*no of blobs on row */
802 char partids[], /*partition no of each blob */
803 int partsizes[], /*no in each partition */
804 int biggestpart, //major partition
805 float jumplimit /*allowed delta change */
806 ) {
807  BOOL8 found_one; //found a bestpart blob
808  BOOL8 close_one; //found was close enough
809  register int blobindex; /*no along text line */
810  int prevpart; //previous iteration
811  int runlength; //no in this part
812  float diff; /*difference from line */
813  int startx; /*index of start blob */
814  int test_blob; //another index
815  FCOORD coord; //blob coordinate
816  float m, c; //fitted line
817  QLSQ stats; //line stuff
818 
819  prevpart = biggestpart;
820  runlength = 0;
821  startx = 0;
822  for (blobindex = 0; blobindex < blobcount; blobindex++) {
823  if (partids[blobindex] != prevpart) {
824  // tprintf("Partition change at (%d,%d) from %d to %d after run of %d\n",
825  // blobcoords[blobindex].left(),blobcoords[blobindex].bottom(),
826  // prevpart,partids[blobindex],runlength);
827  if (prevpart != biggestpart && runlength > MAXBADRUN) {
828  stats.clear ();
829  for (test_blob = startx; test_blob < blobindex; test_blob++) {
830  coord = FCOORD ((blobcoords[test_blob].left ()
831  + blobcoords[test_blob].right ()) / 2.0,
832  blobcoords[test_blob].bottom ());
833  stats.add (coord.x (), coord.y ());
834  }
835  stats.fit (1);
836  m = stats.get_b ();
837  c = stats.get_c ();
839  tprintf ("Fitted line y=%g x + %g\n", m, c);
840  found_one = FALSE;
841  close_one = FALSE;
842  for (test_blob = 1; !found_one
843  && (startx - test_blob >= 0
844  || blobindex + test_blob <= blobcount); test_blob++) {
845  if (startx - test_blob >= 0
846  && partids[startx - test_blob] == biggestpart) {
847  found_one = TRUE;
848  coord = FCOORD ((blobcoords[startx - test_blob].left ()
849  + blobcoords[startx -
850  test_blob].right ()) /
851  2.0,
852  blobcoords[startx -
853  test_blob].bottom ());
854  diff = m * coord.x () + c - coord.y ();
856  tprintf
857  ("Diff of common blob to suspect part=%g at (%g,%g)\n",
858  diff, coord.x (), coord.y ());
859  if (diff < jumplimit && -diff < jumplimit)
860  close_one = TRUE;
861  }
862  if (blobindex + test_blob <= blobcount
863  && partids[blobindex + test_blob - 1] == biggestpart) {
864  found_one = TRUE;
865  coord =
866  FCOORD ((blobcoords[blobindex + test_blob - 1].
867  left () + blobcoords[blobindex + test_blob -
868  1].right ()) / 2.0,
869  blobcoords[blobindex + test_blob -
870  1].bottom ());
871  diff = m * coord.x () + c - coord.y ();
873  tprintf
874  ("Diff of common blob to suspect part=%g at (%g,%g)\n",
875  diff, coord.x (), coord.y ());
876  if (diff < jumplimit && -diff < jumplimit)
877  close_one = TRUE;
878  }
879  }
880  if (close_one) {
882  tprintf
883  ("Merged %d blobs back into part %d from %d starting at (%d,%d)\n",
884  runlength, biggestpart, prevpart,
885  blobcoords[startx].left (),
886  blobcoords[startx].bottom ());
887  //switch sides
888  partsizes[prevpart] -= runlength;
889  for (test_blob = startx; test_blob < blobindex; test_blob++)
890  partids[test_blob] = biggestpart;
891  }
892  }
893  prevpart = partids[blobindex];
894  runlength = 1;
895  startx = blobindex;
896  }
897  else
898  runlength++;
899  }
900 }
901 
902 
903 /**********************************************************************
904  * get_ydiffs
905  *
906  * Get the differences between the blobs and the spline,
907  * putting them in ydiffs. The return value is the index
908  * of the blob in the middle of the "best behaved" region
909  **********************************************************************/
910 
911 int
912 get_ydiffs ( //evaluate differences
913 TBOX blobcoords[], //bounding boxes
914 int blobcount, /*no of blobs */
915 QSPLINE * spline, /*approximating spline */
916 float ydiffs[] /*output */
917 ) {
918  register int blobindex; /*current blob */
919  int xcentre; /*xcoord */
920  int lastx; /*last xcentre */
921  float diffsum; /*sum of diffs */
922  float diff; /*current difference */
923  float drift; /*sum of spline steps */
924  float bestsum; /*smallest diffsum */
925  int bestindex; /*index of bestsum */
926 
927  diffsum = 0.0f;
928  bestindex = 0;
929  bestsum = (float) MAX_INT32;
930  drift = 0.0f;
931  lastx = blobcoords[0].left ();
932  /*do each blob in row */
933  for (blobindex = 0; blobindex < blobcount; blobindex++) {
934  /*centre of blob */
935  xcentre = (blobcoords[blobindex].left () + blobcoords[blobindex].right ()) >> 1;
936  //step functions in spline
937  drift += spline->step (lastx, xcentre);
938  lastx = xcentre;
939  diff = blobcoords[blobindex].bottom ();
940  diff -= spline->y (xcentre);
941  diff += drift;
942  ydiffs[blobindex] = diff; /*store difference */
943  if (blobindex > 2)
944  /*remove old one */
945  diffsum -= ABS (ydiffs[blobindex - 3]);
946  diffsum += ABS (diff); /*add new one */
947  if (blobindex >= 2 && diffsum < bestsum) {
948  bestsum = diffsum; /*find min sum */
949  bestindex = blobindex - 1; /*middle of set */
950  }
951  }
952  return bestindex;
953 }
954 
955 
956 /**********************************************************************
957  * choose_partition
958  *
959  * Choose a partition for the point and return the index.
960  **********************************************************************/
961 
962 int
963 choose_partition ( //select partition
964 register float diff, /*diff from spline */
965 float partdiffs[], /*diff on all parts */
966 int lastpart, /*last assigned partition */
967 float jumplimit, /*new part threshold */
968 float* drift,
969 float* lastdelta,
970 int *partcount /*no of partitions */
971 ) {
972  register int partition; /*partition no */
973  int bestpart; /*best new partition */
974  float bestdelta; /*best gap from a part */
975  float delta; /*diff from part */
976 
977  if (lastpart < 0) {
978  partdiffs[0] = diff;
979  lastpart = 0; /*first point */
980  *drift = 0.0f;
981  *lastdelta = 0.0f;
982  }
983  /*adjusted diff from part */
984  delta = diff - partdiffs[lastpart] - *drift;
985  if (textord_oldbl_debug) {
986  tprintf ("Diff=%.2f, Delta=%.3f, Drift=%.3f, ", diff, delta, *drift);
987  }
988  if (ABS (delta) > jumplimit / 2) {
989  /*delta on part 0 */
990  bestdelta = diff - partdiffs[0] - *drift;
991  bestpart = 0; /*0 best so far */
992  for (partition = 1; partition < *partcount; partition++) {
993  delta = diff - partdiffs[partition] - *drift;
994  if (ABS (delta) < ABS (bestdelta)) {
995  bestdelta = delta;
996  bestpart = partition; /*part with nearest jump */
997  }
998  }
999  delta = bestdelta;
1000  /*too far away */
1001  if (ABS (bestdelta) > jumplimit
1002  && *partcount < MAXPARTS) { /*and spare part left */
1003  bestpart = (*partcount)++; /*best was new one */
1004  /*start new one */
1005  partdiffs[bestpart] = diff - *drift;
1006  delta = 0.0f;
1007  }
1008  }
1009  else {
1010  bestpart = lastpart; /*best was last one */
1011  }
1012 
1013  if (bestpart == lastpart
1014  && (ABS (delta - *lastdelta) < jumplimit / 2
1015  || ABS (delta) < jumplimit / 2))
1016  /*smooth the drift */
1017  *drift = (3 * *drift + delta) / 3;
1018  *lastdelta = delta;
1019 
1020  if (textord_oldbl_debug) {
1021  tprintf ("P=%d\n", bestpart);
1022  }
1023 
1024  return bestpart;
1025 }
1026 
1027 
1029 //partitions and gives all the rest partid 0*/
1030 //
1031 //merge_partitions(partids,partcount,blobcount,bestpart)
1032 //register char *partids; /*partition numbers*/
1033 //int partcount; /*no of partitions*/
1034 //int blobcount; /*no of blobs*/
1035 //int bestpart; /*best partition*/
1036 //{
1037 // register int blobindex; /*no along text line*/
1038 // int runlength; /*run of same partition*/
1039 // int bestrun; /*biggest runlength*/
1040 //
1041 // bestrun=0; /*no runs yet*/
1042 // runlength=1;
1043 // for (blobindex=1;blobindex<blobcount;blobindex++)
1044 // { if (partids[blobindex]!=partids[blobindex-1])
1045 // { if (runlength>bestrun)
1046 // bestrun=runlength; /*find biggest run*/
1047 // runlength=1; /*new run*/
1048 // }
1049 // else
1050 // { runlength++;
1051 // }
1052 // }
1053 // if (runlength>bestrun)
1054 // bestrun=runlength;
1055 //
1056 // for (blobindex=0;blobindex<blobcount;blobindex++)
1057 // { if (blobindex<1
1058 // || partids[blobindex]!=partids[blobindex-1])
1059 // { if ((blobindex+1>=blobcount
1060 // || partids[blobindex]!=partids[blobindex+1])
1061 // /*loner*/
1062 // && (bestrun>2 || partids[blobindex]!=bestpart))
1063 // { partids[blobindex]=partcount; /*discard loner*/
1064 // }
1065 // else if (blobindex+1<blobcount
1066 // && partids[blobindex]==partids[blobindex+1]
1067 // /*pair*/
1068 // && (blobindex+2>=blobcount
1069 // || partids[blobindex]!=partids[blobindex+2])
1070 // && (bestrun>3 || partids[blobindex]!=bestpart))
1071 // { partids[blobindex]=partcount; /*discard both*/
1072 // partids[blobindex+1]=partcount;
1073 // }
1074 // }
1075 // }
1076 // for (blobindex=0;blobindex<blobcount;blobindex++)
1077 // { if (partids[blobindex]<partcount)
1078 // partids[blobindex]=0; /*all others together*/
1079 // }
1080 //}
1081 
1082 /**********************************************************************
1083  * partition_coords
1084  *
1085  * Get the x,y coordinates of all points in the bestpart and put them
1086  * in xcoords,ycoords. Return the number of points found.
1087  **********************************************************************/
1088 
1089 int
1090 partition_coords ( //find relevant coords
1091 TBOX blobcoords[], //bounding boxes
1092 int blobcount, /*no of blobs in row */
1093 char partids[], /*partition no of each blob */
1094 int bestpart, /*best new partition */
1095 int xcoords[], /*points to work on */
1096 int ycoords[] /*points to work on */
1097 ) {
1098  register int blobindex; /*no along text line */
1099  int pointcount; /*no of points */
1100 
1101  pointcount = 0;
1102  for (blobindex = 0; blobindex < blobcount; blobindex++) {
1103  if (partids[blobindex] == bestpart) {
1104  /*centre of blob */
1105  xcoords[pointcount] = (blobcoords[blobindex].left () + blobcoords[blobindex].right ()) >> 1;
1106  ycoords[pointcount++] = blobcoords[blobindex].bottom ();
1107  }
1108  }
1109  return pointcount; /*no of points found */
1110 }
1111 
1112 
1113 /**********************************************************************
1114  * segment_spline
1115  *
1116  * Segment the row at midpoints between maxima and minima of the x,y pairs.
1117  * The xstarts of the segments are returned and the number found.
1118  **********************************************************************/
1119 
1120 int
1121 segment_spline ( //make xstarts
1122 TBOX blobcoords[], //boundign boxes
1123 int blobcount, /*no of blobs in row */
1124 int xcoords[], /*points to work on */
1125 int ycoords[], /*points to work on */
1126 int degree, int pointcount, /*no of points */
1127 int xstarts[] //result
1128 ) {
1129  register int ptindex; /*no along text line */
1130  register int segment; /*partition no */
1131  int lastmin, lastmax; /*possible turn points */
1132  int turnpoints[SPLINESIZE]; /*good turning points */
1133  int turncount; /*no of turning points */
1134  int max_x; //max specified coord
1135 
1136  xstarts[0] = xcoords[0] - 1; //leftmost defined pt
1137  max_x = xcoords[pointcount - 1] + 1;
1138  if (degree < 2)
1139  pointcount = 0;
1140  turncount = 0; /*no turning points yet */
1141  if (pointcount > 3) {
1142  ptindex = 1;
1143  lastmax = lastmin = 0; /*start with first one */
1144  while (ptindex < pointcount - 1 && turncount < SPLINESIZE - 1) {
1145  /*minimum */
1146  if (ycoords[ptindex - 1] > ycoords[ptindex] && ycoords[ptindex] <= ycoords[ptindex + 1]) {
1147  if (ycoords[ptindex] < ycoords[lastmax] - TURNLIMIT) {
1148  if (turncount == 0 || turnpoints[turncount - 1] != lastmax)
1149  /*new max point */
1150  turnpoints[turncount++] = lastmax;
1151  lastmin = ptindex; /*latest minimum */
1152  }
1153  else if (ycoords[ptindex] < ycoords[lastmin]) {
1154  lastmin = ptindex; /*lower minimum */
1155  }
1156  }
1157 
1158  /*maximum */
1159  if (ycoords[ptindex - 1] < ycoords[ptindex] && ycoords[ptindex] >= ycoords[ptindex + 1]) {
1160  if (ycoords[ptindex] > ycoords[lastmin] + TURNLIMIT) {
1161  if (turncount == 0 || turnpoints[turncount - 1] != lastmin)
1162  /*new min point */
1163  turnpoints[turncount++] = lastmin;
1164  lastmax = ptindex; /*latest maximum */
1165  }
1166  else if (ycoords[ptindex] > ycoords[lastmax]) {
1167  lastmax = ptindex; /*higher maximum */
1168  }
1169  }
1170  ptindex++;
1171  }
1172  /*possible global min */
1173  if (ycoords[ptindex] < ycoords[lastmax] - TURNLIMIT
1174  && (turncount == 0 || turnpoints[turncount - 1] != lastmax)) {
1175  if (turncount < SPLINESIZE - 1)
1176  /*2 more turns */
1177  turnpoints[turncount++] = lastmax;
1178  if (turncount < SPLINESIZE - 1)
1179  turnpoints[turncount++] = ptindex;
1180  }
1181  else if (ycoords[ptindex] > ycoords[lastmin] + TURNLIMIT
1182  /*possible global max */
1183  && (turncount == 0 || turnpoints[turncount - 1] != lastmin)) {
1184  if (turncount < SPLINESIZE - 1)
1185  /*2 more turns */
1186  turnpoints[turncount++] = lastmin;
1187  if (turncount < SPLINESIZE - 1)
1188  turnpoints[turncount++] = ptindex;
1189  }
1190  else if (turncount > 0 && turnpoints[turncount - 1] == lastmin
1191  && turncount < SPLINESIZE - 1) {
1192  if (ycoords[ptindex] > ycoords[lastmax])
1193  turnpoints[turncount++] = ptindex;
1194  else
1195  turnpoints[turncount++] = lastmax;
1196  }
1197  else if (turncount > 0 && turnpoints[turncount - 1] == lastmax
1198  && turncount < SPLINESIZE - 1) {
1199  if (ycoords[ptindex] < ycoords[lastmin])
1200  turnpoints[turncount++] = ptindex;
1201  else
1202  turnpoints[turncount++] = lastmin;
1203  }
1204  }
1205 
1206  if (textord_oldbl_debug && turncount > 0)
1207  tprintf ("First turn is %d at (%d,%d)\n",
1208  turnpoints[0], xcoords[turnpoints[0]], ycoords[turnpoints[0]]);
1209  for (segment = 1; segment < turncount; segment++) {
1210  /*centre y coord */
1211  lastmax = (ycoords[turnpoints[segment - 1]] + ycoords[turnpoints[segment]]) / 2;
1212 
1213  /* fix alg so that it works with both rising and falling sections */
1214  if (ycoords[turnpoints[segment - 1]] < ycoords[turnpoints[segment]])
1215  /*find rising y centre */
1216  for (ptindex = turnpoints[segment - 1] + 1; ptindex < turnpoints[segment] && ycoords[ptindex + 1] <= lastmax; ptindex++);
1217  else
1218  /*find falling y centre */
1219  for (ptindex = turnpoints[segment - 1] + 1; ptindex < turnpoints[segment] && ycoords[ptindex + 1] >= lastmax; ptindex++);
1220 
1221  /*centre x */
1222  xstarts[segment] = (xcoords[ptindex - 1] + xcoords[ptindex]
1223  + xcoords[turnpoints[segment - 1]]
1224  + xcoords[turnpoints[segment]] + 2) / 4;
1225  /*halfway between turns */
1226  if (textord_oldbl_debug)
1227  tprintf ("Turn %d is %d at (%d,%d), mid pt is %d@%d, final @%d\n",
1228  segment, turnpoints[segment],
1229  xcoords[turnpoints[segment]], ycoords[turnpoints[segment]],
1230  ptindex - 1, xcoords[ptindex - 1], xstarts[segment]);
1231  }
1232 
1233  xstarts[segment] = max_x;
1234  return segment; /*no of splines */
1235 }
1236 
1237 
1238 /**********************************************************************
1239  * split_stepped_spline
1240  *
1241  * Re-segment the spline in cases where there is a big step function.
1242  * Return TRUE if any were done.
1243  **********************************************************************/
1244 
1245 BOOL8
1246 split_stepped_spline ( //make xstarts
1247 QSPLINE * baseline, //current shot
1248 float jumplimit, //max step fuction
1249 int xcoords[], /*points to work on */
1250 int xstarts[], //result
1251 int &segments //no of segments
1252 ) {
1253  BOOL8 doneany; //return value
1254  register int segment; /*partition no */
1255  int startindex, centreindex, endindex;
1256  float leftcoord, rightcoord;
1257  int leftindex, rightindex;
1258  float step; //spline step
1259 
1260  doneany = FALSE;
1261  startindex = 0;
1262  for (segment = 1; segment < segments - 1; segment++) {
1263  step = baseline->step ((xstarts[segment - 1] + xstarts[segment]) / 2.0,
1264  (xstarts[segment] + xstarts[segment + 1]) / 2.0);
1265  if (step < 0)
1266  step = -step;
1267  if (step > jumplimit) {
1268  while (xcoords[startindex] < xstarts[segment - 1])
1269  startindex++;
1270  centreindex = startindex;
1271  while (xcoords[centreindex] < xstarts[segment])
1272  centreindex++;
1273  endindex = centreindex;
1274  while (xcoords[endindex] < xstarts[segment + 1])
1275  endindex++;
1276  if (segments >= SPLINESIZE) {
1278  tprintf ("Too many segments to resegment spline!!\n");
1279  }
1280  else if (endindex - startindex >= textord_spline_medianwin * 3) {
1281  while (centreindex - startindex <
1282  textord_spline_medianwin * 3 / 2)
1283  centreindex++;
1284  while (endindex - centreindex <
1285  textord_spline_medianwin * 3 / 2)
1286  centreindex--;
1287  leftindex = (startindex + startindex + centreindex) / 3;
1288  rightindex = (centreindex + endindex + endindex) / 3;
1289  leftcoord =
1290  (xcoords[startindex] * 2 + xcoords[centreindex]) / 3.0;
1291  rightcoord =
1292  (xcoords[centreindex] + xcoords[endindex] * 2) / 3.0;
1293  while (xcoords[leftindex] > leftcoord
1294  && leftindex - startindex > textord_spline_medianwin)
1295  leftindex--;
1296  while (xcoords[leftindex] < leftcoord
1297  && centreindex - leftindex >
1299  leftindex++;
1300  if (xcoords[leftindex] - leftcoord >
1301  leftcoord - xcoords[leftindex - 1])
1302  leftindex--;
1303  while (xcoords[rightindex] > rightcoord
1304  && rightindex - centreindex >
1306  rightindex--;
1307  while (xcoords[rightindex] < rightcoord
1308  && endindex - rightindex > textord_spline_medianwin)
1309  rightindex++;
1310  if (xcoords[rightindex] - rightcoord >
1311  rightcoord - xcoords[rightindex - 1])
1312  rightindex--;
1314  tprintf ("Splitting spline at %d with step %g at (%d,%d)\n",
1315  xstarts[segment],
1316  baseline->
1317  step ((xstarts[segment - 1] +
1318  xstarts[segment]) / 2.0,
1319  (xstarts[segment] +
1320  xstarts[segment + 1]) / 2.0),
1321  (xcoords[leftindex - 1] + xcoords[leftindex]) / 2,
1322  (xcoords[rightindex - 1] + xcoords[rightindex]) / 2);
1323  insert_spline_point (xstarts, segment,
1324  (xcoords[leftindex - 1] +
1325  xcoords[leftindex]) / 2,
1326  (xcoords[rightindex - 1] +
1327  xcoords[rightindex]) / 2, segments);
1328  doneany = TRUE;
1329  }
1330  else if (textord_debug_baselines) {
1331  tprintf
1332  ("Resegmenting spline failed - insufficient pts (%d,%d,%d,%d)\n",
1333  startindex, centreindex, endindex,
1335  }
1336  }
1337  // else tprintf("Spline step at %d is %g\n",
1338  // xstarts[segment],
1339  // baseline->step((xstarts[segment-1]+xstarts[segment])/2.0,
1340  // (xstarts[segment]+xstarts[segment+1])/2.0));
1341  }
1342  return doneany;
1343 }
1344 
1345 
1346 /**********************************************************************
1347  * insert_spline_point
1348  *
1349  * Insert a new spline point and shuffle up the others.
1350  **********************************************************************/
1351 
1352 void
1353 insert_spline_point ( //get descenders
1354 int xstarts[], //starts to shuffle
1355 int segment, //insertion pt
1356 int coord1, //coords to add
1357 int coord2, int &segments //total segments
1358 ) {
1359  int index; //for shuffling
1360 
1361  for (index = segments; index > segment; index--)
1362  xstarts[index + 1] = xstarts[index];
1363  segments++;
1364  xstarts[segment] = coord1;
1365  xstarts[segment + 1] = coord2;
1366 }
1367 
1368 
1369 /**********************************************************************
1370  * find_lesser_parts
1371  *
1372  * Average the step from the spline for the other partitions
1373  * and find the commonest partition which has a descender.
1374  **********************************************************************/
1375 
1376 void
1377 find_lesser_parts ( //get descenders
1378 TO_ROW * row, //row to process
1379 TBOX blobcoords[], //bounding boxes
1380 int blobcount, /*no of blobs */
1381 char partids[], /*partition of each blob */
1382 int partsizes[], /*size of each part */
1383 int partcount, /*no of partitions */
1384 int bestpart /*biggest partition */
1385 ) {
1386  register int blobindex; /*index of blob */
1387  register int partition; /*current partition */
1388  int xcentre; /*centre of blob */
1389  int poscount; /*count of best up step */
1390  int negcount; /*count of best down step */
1391  float partsteps[MAXPARTS]; /*average step to part */
1392  float bestpos; /*best up step */
1393  float bestneg; /*best down step */
1394  int runlength; /*length of bad run */
1395  int biggestrun; /*biggest bad run */
1396 
1397  biggestrun = 0;
1398  for (partition = 0; partition < partcount; partition++)
1399  partsteps[partition] = 0.0; /*zero accumulators */
1400  for (runlength = 0, blobindex = 0; blobindex < blobcount; blobindex++) {
1401  xcentre = (blobcoords[blobindex].left ()
1402  + blobcoords[blobindex].right ()) >> 1;
1403  /*in other parts */
1404  if (partids[blobindex] != bestpart) {
1405  runlength++; /*run of non bests */
1406  if (runlength > biggestrun)
1407  biggestrun = runlength;
1408  partsteps[partids[blobindex]] += blobcoords[blobindex].bottom ()
1409  - row->baseline.y (xcentre);
1410  }
1411  else
1412  runlength = 0;
1413  }
1414  if (biggestrun > MAXBADRUN)
1415  row->xheight = -1.0f; /*failed */
1416  else
1417  row->xheight = 1.0f; /*success */
1418  poscount = negcount = 0;
1419  bestpos = bestneg = 0.0; /*no step yet */
1420  for (partition = 0; partition < partcount; partition++) {
1421  if (partition != bestpart) {
1422 
1423  //by jetsoft divide by zero possible
1424  if (partsizes[partition]==0)
1425  partsteps[partition]=0;
1426  else
1427  partsteps[partition] /= partsizes[partition];
1428  //
1429 
1430 
1431  if (partsteps[partition] >= MINASCRISE
1432  && partsizes[partition] > poscount) {
1433  /*ascender rise */
1434  bestpos = partsteps[partition];
1435  /*2nd most popular */
1436  poscount = partsizes[partition];
1437  }
1438  if (partsteps[partition] <= -MINASCRISE
1439  && partsizes[partition] > negcount) {
1440  /*ascender rise */
1441  bestneg = partsteps[partition];
1442  /*2nd most popular */
1443  negcount = partsizes[partition];
1444  }
1445  }
1446  }
1447  /*average x-height */
1448  partsteps[bestpart] /= blobcount;
1449  row->descdrop = bestneg;
1450 }
1451 
1452 
1453 /**********************************************************************
1454  * old_first_xheight
1455  *
1456  * Makes an x-height spline by copying the baseline and shifting it.
1457  * It estimates the x-height across the line to use as the shift.
1458  * It also finds the ascender height if it can.
1459  **********************************************************************/
1460 
1461 void
1462 old_first_xheight ( //the wiseowl way
1463 TO_ROW * row, /*current row */
1464 TBOX blobcoords[], /*blob bounding boxes */
1465 int initialheight, //initial guess
1466 int blobcount, /*blobs in blobcoords */
1467 QSPLINE * baseline, /*established */
1468 float jumplimit /*min ascender height */
1469 ) {
1470  register int blobindex; /*current blob */
1471  /*height statistics */
1472  STATS heightstat (0, MAXHEIGHT);
1473  int height; /*height of blob */
1474  int xcentre; /*centre of blob */
1475  int lineheight; /*approx xheight */
1476  float ascenders; /*ascender sum */
1477  int asccount; /*no of ascenders */
1478  float xsum; /*xheight sum */
1479  int xcount; /*xheight count */
1480  register float diff; /*height difference */
1481 
1482  if (blobcount > 1) {
1483  for (blobindex = 0; blobindex < blobcount; blobindex++) {
1484  xcentre = (blobcoords[blobindex].left ()
1485  + blobcoords[blobindex].right ()) / 2;
1486  /*height of blob */
1487  height = (int) (blobcoords[blobindex].top () - baseline->y (xcentre) + 0.5);
1488  if (height > initialheight * oldbl_xhfract
1489  && height > textord_min_xheight)
1490  heightstat.add (height, 1);
1491  }
1492  if (heightstat.get_total () > 3) {
1493  lineheight = (int) heightstat.ile (0.25);
1494  if (lineheight <= 0)
1495  lineheight = (int) heightstat.ile (0.5);
1496  }
1497  else
1498  lineheight = initialheight;
1499  }
1500  else {
1501  lineheight = (int) (blobcoords[0].top ()
1502  - baseline->y ((blobcoords[0].left ()
1503  + blobcoords[0].right ()) / 2) +
1504  0.5);
1505  }
1506 
1507  xsum = 0.0f;
1508  xcount = 0;
1509  for (ascenders = 0.0f, asccount = 0, blobindex = 0; blobindex < blobcount;
1510  blobindex++) {
1511  xcentre = (blobcoords[blobindex].left ()
1512  + blobcoords[blobindex].right ()) / 2;
1513  diff = blobcoords[blobindex].top () - baseline->y (xcentre);
1514  /*is it ascender */
1515  if (diff > lineheight + jumplimit) {
1516  ascenders += diff;
1517  asccount++; /*count ascenders */
1518  }
1519  else if (diff > lineheight - jumplimit) {
1520  xsum += diff; /*mean xheight */
1521  xcount++;
1522  }
1523  }
1524  if (xcount > 0)
1525  xsum /= xcount; /*average xheight */
1526  else
1527  xsum = (float) lineheight; /*guess it */
1528  row->xheight *= xsum;
1529  if (asccount > 0)
1530  row->ascrise = ascenders / asccount - xsum;
1531  else
1532  row->ascrise = 0.0f; /*had none */
1533  if (row->xheight == 0)
1534  row->xheight = -1.0f;
1535 }
1536 
1537 
1538 /**********************************************************************
1539  * make_first_xheight
1540  *
1541  * Makes an x-height spline by copying the baseline and shifting it.
1542  * It estimates the x-height across the line to use as the shift.
1543  * It also finds the ascender height if it can.
1544  **********************************************************************/
1545 
1546 void
1547 make_first_xheight ( //find xheight
1548 TO_ROW * row, /*current row */
1549 TBOX blobcoords[], /*blob bounding boxes */
1550 int lineheight, //initial guess
1551 int init_lineheight, //block level guess
1552 int blobcount, /*blobs in blobcoords */
1553 QSPLINE * baseline, /*established */
1554 float jumplimit /*min ascender height */
1555 ) {
1556  STATS heightstat (0, HEIGHTBUCKETS);
1557  int lefts[HEIGHTBUCKETS];
1558  int rights[HEIGHTBUCKETS];
1559  int modelist[MODENUM];
1560  int blobindex;
1561  int mode_count; //blobs to count in thr
1562  int sign_bit;
1563  int mode_threshold;
1564  const int kBaselineTouch = 2; // This really should change with resolution.
1565  const int kGoodStrength = 8; // Strength of baseline-touching heights.
1566  const float kMinHeight = 0.25; // Min fraction of lineheight to use.
1567 
1568  sign_bit = row->xheight > 0 ? 1 : -1;
1569 
1570  memset(lefts, 0, HEIGHTBUCKETS * sizeof(lefts[0]));
1571  memset(rights, 0, HEIGHTBUCKETS * sizeof(rights[0]));
1572  mode_count = 0;
1573  for (blobindex = 0; blobindex < blobcount; blobindex++) {
1574  int xcenter = (blobcoords[blobindex].left () +
1575  blobcoords[blobindex].right ()) / 2;
1576  float base = baseline->y(xcenter);
1577  float bottomdiff = fabs(base - blobcoords[blobindex].bottom());
1578  int strength = textord_ocropus_mode &&
1579  bottomdiff <= kBaselineTouch ? kGoodStrength : 1;
1580  int height = static_cast<int>(blobcoords[blobindex].top () - base + 0.5);
1581  if (blobcoords[blobindex].height () > init_lineheight * kMinHeight) {
1582  if (height > lineheight * oldbl_xhfract
1583  && height > textord_min_xheight) {
1584  heightstat.add (height, strength);
1585  if (height < HEIGHTBUCKETS) {
1586  if (xcenter > rights[height])
1587  rights[height] = xcenter;
1588  if (xcenter > 0 && (lefts[height] == 0 || xcenter < lefts[height]))
1589  lefts[height] = xcenter;
1590  }
1591  }
1592  mode_count += strength;
1593  }
1594  }
1595 
1596  mode_threshold = (int) (blobcount * 0.1);
1597  if (oldbl_dot_error_size > 1 || oldbl_xhfix)
1598  mode_threshold = (int) (mode_count * 0.1);
1599 
1600  if (textord_oldbl_debug) {
1601  tprintf ("blobcount=%d, mode_count=%d, mode_t=%d\n",
1602  blobcount, mode_count, mode_threshold);
1603  }
1604  find_top_modes(&heightstat, HEIGHTBUCKETS, modelist, MODENUM);
1605  if (textord_oldbl_debug) {
1606  for (blobindex = 0; blobindex < MODENUM; blobindex++)
1607  tprintf ("mode[%d]=%d ", blobindex, modelist[blobindex]);
1608  tprintf ("\n");
1609  }
1610  pick_x_height(row, modelist, lefts, rights, &heightstat, mode_threshold);
1611 
1612  if (textord_oldbl_debug)
1613  tprintf ("Output xheight=%g\n", row->xheight);
1614  if (row->xheight < 0 && textord_oldbl_debug)
1615  tprintf ("warning: Row Line height < 0; %4.2f\n", row->xheight);
1616 
1617  if (sign_bit < 0)
1618  row->xheight = -row->xheight;
1619 }
1620 
1621 /**********************************************************************
1622  * find_top_modes
1623  *
1624  * Fill the input array with the indices of the top ten modes of the
1625  * input distribution.
1626  **********************************************************************/
1627 
1628 const int kMinModeFactorOcropus = 32;
1629 const int kMinModeFactor = 12;
1630 
1631 void
1632 find_top_modes ( //get modes
1633 STATS * stats, //stats to hack
1634 int statnum, //no of piles
1635 int modelist[], int modenum //no of modes to get
1636 ) {
1637  int mode_count;
1638  int last_i = 0;
1639  int last_max = MAX_INT32;
1640  int i;
1641  int mode;
1642  int total_max = 0;
1643  int mode_factor = textord_ocropus_mode ?
1645 
1646  for (mode_count = 0; mode_count < modenum; mode_count++) {
1647  mode = 0;
1648  for (i = 0; i < statnum; i++) {
1649  if (stats->pile_count (i) > stats->pile_count (mode)) {
1650  if ((stats->pile_count (i) < last_max) ||
1651  ((stats->pile_count (i) == last_max) && (i > last_i))) {
1652  mode = i;
1653  }
1654  }
1655  }
1656  last_i = mode;
1657  last_max = stats->pile_count (last_i);
1658  total_max += last_max;
1659  if (last_max <= total_max / mode_factor)
1660  mode = 0;
1661  modelist[mode_count] = mode;
1662  }
1663 }
1664 
1665 
1666 /**********************************************************************
1667  * pick_x_height
1668  *
1669  * Choose based on the height modes the best x height value.
1670  **********************************************************************/
1671 
1672 void pick_x_height(TO_ROW * row, //row to do
1673  int modelist[],
1674  int lefts[], int rights[],
1675  STATS * heightstat,
1676  int mode_threshold) {
1677  int x;
1678  int y;
1679  int z;
1680  float ratio;
1681  int found_one_bigger = FALSE;
1682  int best_x_height = 0;
1683  int best_asc = 0;
1684  int num_in_best;
1685 
1686  for (x = 0; x < MODENUM; x++) {
1687  for (y = 0; y < MODENUM; y++) {
1688  /* Check for two modes */
1689  if (modelist[x] && modelist[y] &&
1690  heightstat->pile_count (modelist[x]) > mode_threshold &&
1692  MIN(rights[modelist[x]], rights[modelist[y]]) >
1693  MAX(lefts[modelist[x]], lefts[modelist[y]]))) {
1694  ratio = (float) modelist[y] / (float) modelist[x];
1695  if (1.2 < ratio && ratio < 1.8) {
1696  /* Two modes found */
1697  best_x_height = modelist[x];
1698  num_in_best = heightstat->pile_count (modelist[x]);
1699 
1700  /* Try to get one higher */
1701  do {
1702  found_one_bigger = FALSE;
1703  for (z = 0; z < MODENUM; z++) {
1704  if (modelist[z] == best_x_height + 1 &&
1706  MIN(rights[modelist[x]], rights[modelist[y]]) >
1707  MAX(lefts[modelist[x]], lefts[modelist[y]]))) {
1708  ratio = (float) modelist[y] / (float) modelist[z];
1709  if ((1.2 < ratio && ratio < 1.8) &&
1710  /* Should be half of best */
1711  heightstat->pile_count (modelist[z]) >
1712  num_in_best * 0.5) {
1713  best_x_height++;
1714  found_one_bigger = TRUE;
1715  break;
1716  }
1717  }
1718  }
1719  }
1720  while (found_one_bigger);
1721 
1722  /* try to get a higher ascender */
1723 
1724  best_asc = modelist[y];
1725  num_in_best = heightstat->pile_count (modelist[y]);
1726 
1727  /* Try to get one higher */
1728  do {
1729  found_one_bigger = FALSE;
1730  for (z = 0; z < MODENUM; z++) {
1731  if (modelist[z] > best_asc &&
1733  MIN(rights[modelist[x]], rights[modelist[y]]) >
1734  MAX(lefts[modelist[x]], lefts[modelist[y]]))) {
1735  ratio = (float) modelist[z] / (float) best_x_height;
1736  if ((1.2 < ratio && ratio < 1.8) &&
1737  /* Should be half of best */
1738  heightstat->pile_count (modelist[z]) >
1739  num_in_best * 0.5) {
1740  best_asc = modelist[z];
1741  found_one_bigger = TRUE;
1742  break;
1743  }
1744  }
1745  }
1746  }
1747  while (found_one_bigger);
1748 
1749  row->xheight = (float) best_x_height;
1750  row->ascrise = (float) best_asc - best_x_height;
1751  return;
1752  }
1753  }
1754  }
1755  }
1756 
1757  best_x_height = modelist[0]; /* Single Mode found */
1758  num_in_best = heightstat->pile_count (best_x_height);
1759  do {
1760  /* Try to get one higher */
1761  found_one_bigger = FALSE;
1762  for (z = 1; z < MODENUM; z++) {
1763  /* Should be half of best */
1764  if ((modelist[z] == best_x_height + 1) &&
1765  (heightstat->pile_count (modelist[z]) > num_in_best * 0.5)) {
1766  best_x_height++;
1767  found_one_bigger = TRUE;
1768  break;
1769  }
1770  }
1771  }
1772  while (found_one_bigger);
1773 
1774  row->ascrise = 0.0f;
1775  row->xheight = (float) best_x_height;
1776  if (row->xheight == 0)
1777  row->xheight = -1.0f;
1778 }