Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tabvector.cpp
Go to the documentation of this file.
1 
2 // File: tabvector.cpp
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author: Ray Smith
5 // Created: Thu Apr 10 16:28:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tabvector.h"
25 #include "blobbox.h"
26 #include "colfind.h"
27 #include "colpartitionset.h"
28 #include "detlinefit.h"
29 #include "statistc.h"
30 
31 // Include automatically generated configuration file if running autoconf.
32 #ifdef HAVE_CONFIG_H
33 #include "config_auto.h"
34 #endif
35 
36 namespace tesseract {
37 
38 // Multiple of height used as a gutter for evaluation search.
39 const int kGutterMultiple = 4;
40 // Multiple of neighbour gap that we expect the gutter gap to be at minimum.
42 // Pixel distance for tab vectors to be considered the same.
43 const int kSimilarVectorDist = 10;
44 // Pixel distance for ragged tab vectors to be considered the same if there
45 // is nothing in the overlap box
46 const int kSimilarRaggedDist = 50;
47 // Max multiple of height to allow filling in between blobs when evaluating.
48 const int kMaxFillinMultiple = 11;
49 // Min fraction of mean gutter size to allow a gutter on a good tab blob.
50 const double kMinGutterFraction = 0.5;
51 // Multiple of 1/n lines as a minimum gutter in evaluation.
52 const double kLineCountReciprocal = 4.0;
53 // Constant add-on for minimum gutter for aligned tabs.
54 const double kMinAlignedGutter = 0.25;
55 // Constant add-on for minimum gutter for ragged tabs.
56 const double kMinRaggedGutter = 1.5;
57 
59  "max fraction of mean blob width allowed for vertical gaps in vertical text");
60 
62  "Fraction of box matches required to declare a line vertical");
63 
65 
66 // Create a constraint for the top or bottom of this TabVector.
67 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
68  TabConstraint* constraint = new TabConstraint(vector, is_top);
69  TabConstraint_LIST* constraints = new TabConstraint_LIST;
70  TabConstraint_IT it(constraints);
71  it.add_to_end(constraint);
72  if (is_top)
73  vector->set_top_constraints(constraints);
74  else
75  vector->set_bottom_constraints(constraints);
76 }
77 
78 // Test to see if the constraints are compatible enough to merge.
79 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
80  TabConstraint_LIST* list2) {
81  if (list1 == list2)
82  return false;
83  int y_min = -MAX_INT32;
84  int y_max = MAX_INT32;
85  if (textord_debug_tabfind > 3)
86  tprintf("Testing constraint compatibility\n");
87  GetConstraints(list1, &y_min, &y_max);
88  GetConstraints(list2, &y_min, &y_max);
89  if (textord_debug_tabfind > 3)
90  tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
91  return y_max >= y_min;
92 }
93 
94 // Merge the lists of constraints and update the TabVector pointers.
95 // The second list is deleted.
96 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
97  TabConstraint_LIST* list2) {
98  if (list1 == list2)
99  return;
100  TabConstraint_IT it(list2);
101  if (textord_debug_tabfind > 3)
102  tprintf("Merging constraints\n");
103  // The vectors of all constraints on list2 are now going to be on list1.
104  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
105  TabConstraint* constraint = it.data();
106  if (textord_debug_tabfind> 3)
107  constraint->vector_->Print("Merge");
108  if (constraint->is_top_)
109  constraint->vector_->set_top_constraints(list1);
110  else
111  constraint->vector_->set_bottom_constraints(list1);
112  }
113  it = list1;
114  it.add_list_before(list2);
115  delete list2;
116 }
117 
118 // Set all the tops and bottoms as appropriate to a mean of the
119 // constrained range. Delete all the constraints and list.
120 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
121  int y_min = -MAX_INT32;
122  int y_max = MAX_INT32;
123  GetConstraints(constraints, &y_min, &y_max);
124  int y = (y_min + y_max) / 2;
125  TabConstraint_IT it(constraints);
126  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
127  TabConstraint* constraint = it.data();
128  TabVector* v = constraint->vector_;
129  if (constraint->is_top_) {
130  v->SetYEnd(y);
132  } else {
133  v->SetYStart(y);
135  }
136  }
137  delete constraints;
138 }
139 
140 TabConstraint::TabConstraint(TabVector* vector, bool is_top)
141  : vector_(vector), is_top_(is_top) {
142  if (is_top) {
143  y_min_ = vector->endpt().y();
144  y_max_ = vector->extended_ymax();
145  } else {
146  y_max_ = vector->startpt().y();
147  y_min_ = vector->extended_ymin();
148  }
149 }
150 
151 // Get the max of the mins and the min of the maxes.
152 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
153  int* y_min, int* y_max) {
154  TabConstraint_IT it(constraints);
155  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
156  TabConstraint* constraint = it.data();
157  if (textord_debug_tabfind > 3) {
158  tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
159  constraint->vector_->Print(" for");
160  }
161  *y_min = MAX(*y_min, constraint->y_min_);
162  *y_max = MIN(*y_max, constraint->y_max_);
163  }
164 }
165 
166 ELIST2IZE(TabVector)
167 CLISTIZE(TabVector)
168 
169 // The constructor is private. See the bottom of the file...
170 
172 }
173 
174 
175 // Public factory to build a TabVector from a list of boxes.
176 // The TabVector will be of the given alignment type.
177 // The input vertical vector is used in fitting, and the output
178 // vertical_x, vertical_y have the resulting line vector added to them
179 // if the alignment is not ragged.
180 // The extended_start_y and extended_end_y are the maximum possible
181 // extension to the line segment that can be used to align with others.
182 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
184  int extended_start_y, int extended_end_y,
185  BLOBNBOX_CLIST* good_points,
186  int* vertical_x, int* vertical_y) {
187  TabVector* vector = new TabVector(extended_start_y, extended_end_y,
188  alignment, good_points);
189  if (!vector->Fit(vertical, false)) {
190  delete vector;
191  return NULL;
192  }
193  if (!vector->IsRagged()) {
194  vertical = vector->endpt_ - vector->startpt_;
195  int weight = vector->BoxCount();
196  *vertical_x += vertical.x() * weight;
197  *vertical_y += vertical.y() * weight;
198  }
199  return vector;
200 }
201 
202 // Build a ragged TabVector by copying another's direction, shifting it
203 // to match the given blob, and making its initial extent the height
204 // of the blob, but its extended bounds from the bounds of the original.
206  const ICOORD& vertical_skew, BLOBNBOX* blob)
207  : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
208  sort_key_(0), percent_score_(0), mean_width_(0),
209  needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
210  alignment_(alignment),
211  top_constraints_(NULL), bottom_constraints_(NULL) {
212  BLOBNBOX_C_IT it(&boxes_);
213  it.add_to_end(blob);
214  TBOX box = blob->bounding_box();
215  if (IsLeftTab()) {
216  startpt_ = box.botleft();
217  endpt_ = box.topleft();
218  } else {
219  startpt_ = box.botright();
220  endpt_ = box.topright();
221  }
222  sort_key_ = SortKey(vertical_skew,
223  (startpt_.x() + endpt_.x()) / 2,
224  (startpt_.y() + endpt_.y()) / 2);
225  if (textord_debug_tabfind > 3)
226  Print("Constructed a new tab vector:");
227 }
228 
229 // Copies basic attributes of a tab vector for simple operations.
230 // Copies things such startpt, endpt, range.
231 // Does not copy things such as partners, boxes, or constraints.
232 // This is useful if you only need vector information for processing, such
233 // as in the table detection code.
235  TabVector* copy = new TabVector();
236  copy->startpt_ = startpt_;
237  copy->endpt_ = endpt_;
238  copy->alignment_ = alignment_;
239  copy->extended_ymax_ = extended_ymax_;
240  copy->extended_ymin_ = extended_ymin_;
241  copy->intersects_other_lines_ = intersects_other_lines_;
242  return copy;
243 }
244 
245 // Extend this vector to include the supplied blob if it doesn't
246 // already have it.
248  TBOX new_box = new_blob->bounding_box();
249  BLOBNBOX_C_IT it(&boxes_);
250  if (!it.empty()) {
251  BLOBNBOX* blob = it.data();
252  TBOX box = blob->bounding_box();
253  while (!it.at_last() && box.top() <= new_box.top()) {
254  if (blob == new_blob)
255  return; // We have it already.
256  it.forward();
257  blob = it.data();
258  box = blob->bounding_box();
259  }
260  if (box.top() >= new_box.top()) {
261  it.add_before_stay_put(new_blob);
262  needs_refit_ = true;
263  return;
264  }
265  }
266  needs_refit_ = true;
267  it.add_after_stay_put(new_blob);
268 }
269 
270 // Set the ycoord of the start and move the xcoord to match.
271 void TabVector::SetYStart(int start_y) {
272  startpt_.set_x(XAtY(start_y));
273  startpt_.set_y(start_y);
274 }
275 // Set the ycoord of the end and move the xcoord to match.
276 void TabVector::SetYEnd(int end_y) {
277  endpt_.set_x(XAtY(end_y));
278  endpt_.set_y(end_y);
279 }
280 
281 // Rotate the ends by the given vector. Auto flip start and end if needed.
282 void TabVector::Rotate(const FCOORD& rotation) {
283  startpt_.rotate(rotation);
284  endpt_.rotate(rotation);
285  int dx = endpt_.x() - startpt_.x();
286  int dy = endpt_.y() - startpt_.y();
287  if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
288  // Need to flip start/end.
289  ICOORD tmp = startpt_;
290  startpt_ = endpt_;
291  endpt_ = tmp;
292  }
293 }
294 
295 // Setup the initial constraints, being the limits of
296 // the vector and the extended ends.
298  TabConstraint::CreateConstraint(this, false);
300 }
301 
302 // Setup the constraints between the partners of this TabVector.
304  // With the first and last partner, we want a common bottom and top,
305  // respectively, and for each change of partner, we want a common
306  // top of first with bottom of next.
307  TabVector_C_IT it(&partners_);
308  TabVector* prev_partner = NULL;
309  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
310  TabVector* partner = it.data();
311  if (partner->top_constraints_ == NULL ||
312  partner->bottom_constraints_ == NULL) {
313  partner->Print("Impossible: has no constraints");
314  Print("This vector has it as a partner");
315  continue;
316  }
317  if (prev_partner == NULL) {
318  // This is the first partner, so common bottom.
319  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
320  partner->bottom_constraints_))
321  TabConstraint::MergeConstraints(bottom_constraints_,
322  partner->bottom_constraints_);
323  } else {
324  // We need prev top to be common with partner bottom.
325  if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
326  partner->bottom_constraints_))
327  TabConstraint::MergeConstraints(prev_partner->top_constraints_,
328  partner->bottom_constraints_);
329  }
330  prev_partner = partner;
331  if (it.at_last()) {
332  // This is the last partner, so common top.
333  if (TabConstraint::CompatibleConstraints(top_constraints_,
334  partner->top_constraints_))
335  TabConstraint::MergeConstraints(top_constraints_,
336  partner->top_constraints_);
337  }
338  }
339 }
340 
341 // Setup the constraints between this and its partner.
343  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
344  partner->bottom_constraints_))
345  TabConstraint::MergeConstraints(bottom_constraints_,
346  partner->bottom_constraints_);
347  if (TabConstraint::CompatibleConstraints(top_constraints_,
348  partner->top_constraints_))
349  TabConstraint::MergeConstraints(top_constraints_,
350  partner->top_constraints_);
351 }
352 
353 // Use the constraints to modify the top and bottom.
355  if (top_constraints_ != NULL)
356  TabConstraint::ApplyConstraints(top_constraints_);
357  if (bottom_constraints_ != NULL)
358  TabConstraint::ApplyConstraints(bottom_constraints_);
359 }
360 
361 // Merge close tab vectors of the same side that overlap.
363  TabVector_LIST* vectors,
364  BlobGrid* grid) {
365  TabVector_IT it1(vectors);
366  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
367  TabVector* v1 = it1.data();
368  TabVector_IT it2(it1);
369  for (it2.forward(); !it2.at_first(); it2.forward()) {
370  TabVector* v2 = it2.data();
371  if (v2->SimilarTo(vertical, *v1, grid)) {
372  // Merge into the forward one, in case the combined vector now
373  // overlaps one in between.
374  if (textord_debug_tabfind) {
375  v2->Print("Merging");
376  v1->Print("by deleting");
377  }
378  v2->MergeWith(vertical, it1.extract());
379  if (textord_debug_tabfind) {
380  v2->Print("Producing");
381  }
382  ICOORD merged_vector = v2->endpt();
383  merged_vector -= v2->startpt();
384  if (abs(merged_vector.x()) > 100) {
385  v2->Print("Garbage result of merge?");
386  }
387  break;
388  }
389  }
390  }
391 }
392 
393 // Return true if this vector is the same side, overlaps, and close
394 // enough to the other to be merged.
395 bool TabVector::SimilarTo(const ICOORD& vertical,
396  const TabVector& other, BlobGrid* grid) const {
397  if ((IsRightTab() && other.IsRightTab()) ||
398  (IsLeftTab() && other.IsLeftTab())) {
399  // If they don't overlap, at least in extensions, then there is no chance.
400  if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
401  return false;
402  // A fast approximation to the scale factor of the sort_key_.
403  int v_scale = abs(vertical.y());
404  if (v_scale == 0)
405  v_scale = 1;
406  // If they are close enough, then OK.
407  if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
408  sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
409  return true;
410  // Ragged tabs get a bigger threshold.
411  if (!IsRagged() || !other.IsRagged() ||
412  sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
413  sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
414  return false;
415  if (grid == NULL) {
416  // There is nothing else to test!
417  return true;
418  }
419  // If there is nothing in the rectangle between the vector that is going to
420  // move, and the place it is moving to, then they can be merged.
421  // Setup a vertical search for any blob.
422  const TabVector* mover = (IsRightTab() &&
423  sort_key_ < other.sort_key_) ? this : &other;
424  int top_y = mover->endpt_.y();
425  int bottom_y = mover->startpt_.y();
426  int left = MIN(mover->XAtY(top_y), mover->XAtY(bottom_y));
427  int right = MAX(mover->XAtY(top_y), mover->XAtY(bottom_y));
428  int shift = abs(sort_key_ - other.sort_key_) / v_scale;
429  if (IsRightTab()) {
430  right += shift;
431  } else {
432  left -= shift;
433  }
434 
436  vsearch.StartVerticalSearch(left, right, top_y);
437  BLOBNBOX* blob;
438  while ((blob = vsearch.NextVerticalSearch(true)) != NULL) {
439  TBOX box = blob->bounding_box();
440  if (box.top() > bottom_y)
441  return true; // Nothing found.
442  if (box.bottom() < top_y)
443  continue; // Doesn't overlap.
444  int left_at_box = XAtY(box.bottom());
445  int right_at_box = left_at_box;
446  if (IsRightTab())
447  right_at_box += shift;
448  else
449  left_at_box -= shift;
450  if (MIN(right_at_box, box.right()) > MAX(left_at_box, box.left()))
451  return false;
452  }
453  return true; // Nothing found.
454  }
455  return false;
456 }
457 
458 // Eat the other TabVector into this and delete it.
459 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
460  extended_ymin_ = MIN(extended_ymin_, other->extended_ymin_);
461  extended_ymax_ = MAX(extended_ymax_, other->extended_ymax_);
462  if (other->IsRagged()) {
463  alignment_ = other->alignment_;
464  }
465  // Merge sort the two lists of boxes.
466  BLOBNBOX_C_IT it1(&boxes_);
467  BLOBNBOX_C_IT it2(&other->boxes_);
468  while (!it2.empty()) {
469  BLOBNBOX* bbox2 = it2.extract();
470  it2.forward();
471  TBOX box2 = bbox2->bounding_box();
472  BLOBNBOX* bbox1 = it1.data();
473  TBOX box1 = bbox1->bounding_box();
474  while (box1.bottom() < box2.bottom() && !it1.at_last()) {
475  it1.forward();
476  bbox1 = it1.data();
477  box1 = bbox1->bounding_box();
478  }
479  if (box1.bottom() < box2.bottom()) {
480  it1.add_to_end(bbox2);
481  } else if (bbox1 != bbox2) {
482  it1.add_before_stay_put(bbox2);
483  }
484  }
485  Fit(vertical, true);
486  other->Delete(this);
487 }
488 
489 // Add a new element to the list of partner TabVectors.
490 // Partners must be added in order of increasing y coordinate of the text line
491 // that makes them partners.
492 // Groups of identical partners are merged into one.
494  if (IsSeparator() || partner->IsSeparator())
495  return;
496  TabVector_C_IT it(&partners_);
497  if (!it.empty()) {
498  it.move_to_last();
499  if (it.data() == partner)
500  return;
501  }
502  it.add_after_then_move(partner);
503 }
504 
505 // Return true if other is a partner of this.
506 bool TabVector::IsAPartner(const TabVector* other) {
507  TabVector_C_IT it(&partners_);
508  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
509  if (it.data() == other)
510  return true;
511  }
512  return false;
513 }
514 
515 // These names must be synced with the TabAlignment enum in tabvector.h.
516 const char* kAlignmentNames[] = {
517  "Left Aligned",
518  "Left Ragged",
519  "Center",
520  "Right Aligned",
521  "Right Ragged",
522  "Separator"
523 };
524 
525 // Print basic information about this tab vector.
526 void TabVector::Print(const char* prefix) {
527  if (this == NULL) {
528  tprintf("%s <null>\n", prefix);
529  } else {
530  tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
531  " partners=%d\n",
532  prefix, kAlignmentNames[alignment_],
533  startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),
534  mean_width_, percent_score_, sort_key_,
535  boxes_.length(), partners_.length());
536  }
537 }
538 
539 // Print basic information about this tab vector and every box in it.
540 void TabVector::Debug(const char* prefix) {
541  Print(prefix);
542  BLOBNBOX_C_IT it(&boxes_);
543  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
544  BLOBNBOX* bbox = it.data();
545  const TBOX& box = bbox->bounding_box();
546  tprintf("Box at (%d,%d)->(%d,%d)\n",
547  box.left(), box.bottom(), box.right(), box.top());
548  }
549 }
550 
551 // Draw this tabvector in place in the given window.
553 #ifndef GRAPHICS_DISABLED
555  tab_win->Pen(ScrollView::BLUE);
556  else if (alignment_ == TA_LEFT_ALIGNED)
557  tab_win->Pen(ScrollView::LIME_GREEN);
558  else if (alignment_ == TA_LEFT_RAGGED)
559  tab_win->Pen(ScrollView::DARK_GREEN);
560  else if (alignment_ == TA_RIGHT_ALIGNED)
561  tab_win->Pen(ScrollView::PINK);
562  else if (alignment_ == TA_RIGHT_RAGGED)
563  tab_win->Pen(ScrollView::CORAL);
564  else
565  tab_win->Pen(ScrollView::WHITE);
566  tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
567  tab_win->Pen(ScrollView::GREY);
568  tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
569  tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
570  char score_buf[64];
571  snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
572  tab_win->TextAttributes("Times", 50, false, false, false);
573  tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
574 #endif
575 }
576 
577 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
579  TabFind* finder) {
580  if (needs_refit_)
581  Fit(vertical, true);
582  if (needs_evaluation_)
583  Evaluate(vertical, finder);
584 }
585 
586 // Evaluate the vector in terms of coverage of its length by good-looking
587 // box edges. A good looking box is one where its nearest neighbour on the
588 // inside is nearer than half the distance its nearest neighbour on the
589 // outside of the putative column. Bad boxes are removed from the line.
590 // A second pass then further filters boxes by requiring that the gutter
591 // width be a minimum fraction of the mean gutter along the line.
592 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
593  bool debug = false;
594  needs_evaluation_ = false;
595  int length = endpt_.y() - startpt_.y();
596  if (length == 0 || boxes_.empty()) {
597  percent_score_ = 0;
598  Print("Zero length in evaluate");
599  return;
600  }
601  // Compute the mean box height.
602  BLOBNBOX_C_IT it(&boxes_);
603  int mean_height = 0;
604  int height_count = 0;
605  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
606  BLOBNBOX* bbox = it.data();
607  const TBOX& box = bbox->bounding_box();
608  int height = box.height();
609  mean_height += height;
610  ++height_count;
611  }
612  mean_height /= height_count;
613  int max_gutter = kGutterMultiple * mean_height;
614  if (IsRagged()) {
615  // Ragged edges face a tougher test in that the gap must always be within
616  // the height of the blob.
617  max_gutter = kGutterToNeighbourRatio * mean_height;
618  }
619 
620  STATS gutters(0, max_gutter + 1);
621  // Evaluate the boxes for their goodness, calculating the coverage as we go.
622  // Remove boxes that are not good and shorten the list to the first and
623  // last good boxes.
624  int num_deleted_boxes = 0;
625  bool text_on_image = false;
626  int good_length = 0;
627  const TBOX* prev_good_box = NULL;
628  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
629  BLOBNBOX* bbox = it.data();
630  const TBOX& box = bbox->bounding_box();
631  int mid_y = (box.top() + box.bottom()) / 2;
632  if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
633  if (!debug) {
634  tprintf("After already deleting %d boxes, ", num_deleted_boxes);
635  Print("Starting evaluation");
636  }
637  debug = true;
638  }
639  // A good box is one where the nearest neighbour on the inside is closer
640  // than half the distance to the nearest neighbour on the outside
641  // (of the putative column).
642  bool left = IsLeftTab();
643  int tab_x = XAtY(mid_y);
644  int gutter_width;
645  int neighbour_gap;
646  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
647  bbox, &gutter_width, &neighbour_gap);
648  if (debug) {
649  tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
650  box.left(), box.bottom(), box.right(), box.top(),
651  gutter_width, neighbour_gap);
652  }
653  // Now we can make the test.
654  if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
655  // A good box contributes its height to the good_length.
656  good_length += box.top() - box.bottom();
657  gutters.add(gutter_width, 1);
658  // Two good boxes together contribute the gap between them
659  // to the good_length as well, as long as the gap is not
660  // too big.
661  if (prev_good_box != NULL) {
662  int vertical_gap = box.bottom() - prev_good_box->top();
663  double size1 = sqrt(static_cast<double>(prev_good_box->area()));
664  double size2 = sqrt(static_cast<double>(box.area()));
665  if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2))
666  good_length += vertical_gap;
667  if (debug) {
668  tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
669  vertical_gap, kMaxFillinMultiple * MIN(size1, size2),
670  good_length);
671  }
672  } else {
673  // Adjust the start to the first good box.
674  SetYStart(box.bottom());
675  }
676  prev_good_box = &box;
677  if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
678  text_on_image = true;
679  } else {
680  // Get rid of boxes that are not good.
681  if (debug) {
682  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
683  box.left(), box.bottom(), box.right(), box.top(),
684  gutter_width, neighbour_gap);
685  }
686  it.extract();
687  ++num_deleted_boxes;
688  }
689  }
690  if (debug) {
691  Print("Evaluating:");
692  }
693  // If there are any good boxes, do it again, except this time get rid of
694  // boxes that have a gutter that is a small fraction of the mean gutter.
695  // This filters out ends that run into a coincidental gap in the text.
696  int search_top = endpt_.y();
697  int search_bottom = startpt_.y();
698  int median_gutter = IntCastRounded(gutters.median());
699  if (gutters.get_total() > 0) {
700  prev_good_box = NULL;
701  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
702  BLOBNBOX* bbox = it.data();
703  const TBOX& box = bbox->bounding_box();
704  int mid_y = (box.top() + box.bottom()) / 2;
705  // A good box is one where the gutter width is at least some constant
706  // fraction of the mean gutter width.
707  bool left = IsLeftTab();
708  int tab_x = XAtY(mid_y);
709  int max_gutter = kGutterMultiple * mean_height;
710  if (IsRagged()) {
711  // Ragged edges face a tougher test in that the gap must always be
712  // within the height of the blob.
713  max_gutter = kGutterToNeighbourRatio * mean_height;
714  }
715  int gutter_width;
716  int neighbour_gap;
717  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
718  bbox, &gutter_width, &neighbour_gap);
719  // Now we can make the test.
720  if (gutter_width >= median_gutter * kMinGutterFraction) {
721  if (prev_good_box == NULL) {
722  // Adjust the start to the first good box.
723  SetYStart(box.bottom());
724  search_bottom = box.top();
725  }
726  prev_good_box = &box;
727  search_top = box.bottom();
728  } else {
729  // Get rid of boxes that are not good.
730  if (debug) {
731  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
732  box.left(), box.bottom(), box.right(), box.top(),
733  gutter_width, median_gutter);
734  }
735  it.extract();
736  ++num_deleted_boxes = true;
737  }
738  }
739  }
740  // If there has been a good box, adjust the end.
741  if (prev_good_box != NULL) {
742  SetYEnd(prev_good_box->top());
743  // Compute the percentage of the vector that is occupied by good boxes.
744  int length = endpt_.y() - startpt_.y();
745  percent_score_ = 100 * good_length / length;
746  if (num_deleted_boxes > 0) {
747  needs_refit_ = true;
748  FitAndEvaluateIfNeeded(vertical, finder);
749  if (boxes_.empty())
750  return;
751  }
752  // Test the gutter over the whole vector, instead of just at the boxes.
753  int required_shift;
754  if (search_bottom > search_top) {
755  search_bottom = startpt_.y();
756  search_top = endpt_.y();
757  }
758  double min_gutter_width = kLineCountReciprocal / boxes_.length();
759  min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
760  min_gutter_width *= mean_height;
761  int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
762  if (median_gutter > max_gutter_width)
763  max_gutter_width = median_gutter;
764  int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
765  text_on_image, max_gutter_width,
766  &required_shift);
767  if (gutter_width < min_gutter_width) {
768  if (debug) {
769  tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
770  gutter_width, min_gutter_width);
771  }
772  boxes_.shallow_clear();
773  percent_score_ = 0;
774  } else if (debug) {
775  tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
776  gutter_width, min_gutter_width, required_shift);
777  }
778  } else {
779  // There are no good boxes left, so score is 0.
780  percent_score_ = 0;
781  }
782 
783  if (debug) {
784  Print("Evaluation complete:");
785  }
786 }
787 
788 // (Re)Fit a line to the stored points. Returns false if the line
789 // is degenerate. Althougth the TabVector code mostly doesn't care about the
790 // direction of lines, XAtY would give silly results for a horizontal line.
791 // The class is mostly aimed at use for vertical lines representing
792 // horizontal tab stops.
793 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
794  needs_refit_ = false;
795  if (boxes_.empty()) {
796  // Don't refit something with no boxes, as that only happens
797  // in Evaluate, and we don't want to end up with a zero vector.
798  if (!force_parallel)
799  return false;
800  // If we are forcing parallel, then we just need to set the sort_key_.
801  ICOORD midpt = startpt_;
802  midpt += endpt_;
803  midpt /= 2;
804  sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
805  return startpt_.y() != endpt_.y();
806  }
807  if (!force_parallel && !IsRagged()) {
808  // Use a fitted line as the vertical.
809  DetLineFit linepoints;
810  BLOBNBOX_C_IT it(&boxes_);
811  // Fit a line to all the boxes in the list.
812  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
813  BLOBNBOX* bbox = it.data();
814  TBOX box = bbox->bounding_box();
815  int x1 = IsRightTab() ? box.right() : box.left();
816  ICOORD boxpt(x1, box.bottom());
817  linepoints.Add(boxpt);
818  if (it.at_last()) {
819  ICOORD top_pt(x1, box.top());
820  linepoints.Add(top_pt);
821  }
822  }
823  linepoints.Fit(&startpt_, &endpt_);
824  if (startpt_.y() != endpt_.y()) {
825  vertical = endpt_;
826  vertical -= startpt_;
827  }
828  }
829  int start_y = startpt_.y();
830  int end_y = endpt_.y();
831  sort_key_ = IsLeftTab() ? MAX_INT32 : -MAX_INT32;
832  BLOBNBOX_C_IT it(&boxes_);
833  // Choose a line parallel to the vertical such that all boxes are on the
834  // correct side of it.
835  mean_width_ = 0;
836  int width_count = 0;
837  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
838  BLOBNBOX* bbox = it.data();
839  TBOX box = bbox->bounding_box();
840  mean_width_ += box.width();
841  ++width_count;
842  int x1 = IsRightTab() ? box.right() : box.left();
843  // Test both the bottom and the top, as one will be more extreme, depending
844  // on the direction of skew.
845  int bottom_y = box.bottom();
846  int top_y = box.top();
847  int key = SortKey(vertical, x1, bottom_y);
848  if (IsLeftTab() == (key < sort_key_)) {
849  sort_key_ = key;
850  startpt_ = ICOORD(x1, bottom_y);
851  }
852  key = SortKey(vertical, x1, top_y);
853  if (IsLeftTab() == (key < sort_key_)) {
854  sort_key_ = key;
855  startpt_ = ICOORD(x1, top_y);
856  }
857  if (it.at_first())
858  start_y = bottom_y;
859  if (it.at_last())
860  end_y = top_y;
861  }
862  if (width_count > 0) {
863  mean_width_ = (mean_width_ + width_count - 1) / width_count;
864  }
865  endpt_ = startpt_ + vertical;
866  needs_evaluation_ = true;
867  if (start_y != end_y) {
868  // Set the ends of the vector to fully include the first and last blobs.
869  startpt_.set_x(XAtY(vertical, sort_key_, start_y));
870  startpt_.set_y(start_y);
871  endpt_.set_x(XAtY(vertical, sort_key_, end_y));
872  endpt_.set_y(end_y);
873  return true;
874  }
875  return false;
876 }
877 
878 // Returns the singleton partner if there is one, or NULL otherwise.
880  if (!partners_.singleton())
881  return NULL;
882  TabVector_C_IT partner_it(&partners_);
883  TabVector* partner = partner_it.data();
884  return partner;
885 }
886 
887 // Return the partner of this TabVector if the vector qualifies as
888 // being a vertical text line, otherwise NULL.
890  if (!partners_.singleton())
891  return NULL;
892  TabVector_C_IT partner_it(&partners_);
893  TabVector* partner = partner_it.data();
894  BLOBNBOX_C_IT box_it1(&boxes_);
895  BLOBNBOX_C_IT box_it2(&partner->boxes_);
896  // Count how many boxes are also in the other list.
897  // At the same time, gather the mean width and median vertical gap.
898  if (textord_debug_tabfind > 1) {
899  Print("Testing for vertical text");
900  partner->Print(" partner");
901  }
902  int num_matched = 0;
903  int num_unmatched = 0;
904  int total_widths = 0;
905  int width = startpt().x() - partner->startpt().x();
906  if (width < 0)
907  width = -width;
908  STATS gaps(0, width * 2);
909  BLOBNBOX* prev_bbox = NULL;
910  box_it2.mark_cycle_pt();
911  for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
912  BLOBNBOX* bbox = box_it1.data();
913  TBOX box = bbox->bounding_box();
914  if (prev_bbox != NULL) {
915  gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
916  }
917  while (!box_it2.cycled_list() && box_it2.data() != bbox &&
918  box_it2.data()->bounding_box().bottom() < box.bottom()) {
919  box_it2.forward();
920  }
921  if (!box_it2.cycled_list() && box_it2.data() == bbox &&
922  bbox->region_type() >= BRT_UNKNOWN &&
923  (prev_bbox == NULL || prev_bbox->region_type() >= BRT_UNKNOWN))
924  ++num_matched;
925  else
926  ++num_unmatched;
927  total_widths += box.width();
928  prev_bbox = bbox;
929  }
930  double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
931  double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
932  int min_box_match = static_cast<int>((num_matched + num_unmatched) *
934  bool is_vertical = (gaps.get_total() > 0 &&
935  num_matched >= min_box_match &&
936  gaps.median() <= max_gap);
937  if (textord_debug_tabfind > 1) {
938  tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
939  "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
940  gaps.get_total(), num_matched, num_unmatched, min_box_match,
941  gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
942  }
943  return (is_vertical) ? partner : NULL;
944 }
945 
946 // The constructor is private.
947 TabVector::TabVector(int extended_ymin, int extended_ymax,
948  TabAlignment alignment, BLOBNBOX_CLIST* boxes)
949  : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
950  sort_key_(0), percent_score_(0), mean_width_(0),
951  needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
952  top_constraints_(NULL), bottom_constraints_(NULL) {
953  BLOBNBOX_C_IT it(&boxes_);
954  it.add_list_after(boxes);
955 }
956 
957 // Delete this, but first, repoint all the partners to point to
958 // replacement. If replacement is NULL, then partner relationships
959 // are removed.
960 void TabVector::Delete(TabVector* replacement) {
961  TabVector_C_IT it(&partners_);
962  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
963  TabVector* partner = it.data();
964  TabVector_C_IT p_it(&partner->partners_);
965  // If partner already has replacement in its list, then make
966  // replacement null, and just remove this TabVector when we find it.
967  TabVector* partner_replacement = replacement;
968  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
969  TabVector* p_partner = p_it.data();
970  if (p_partner == partner_replacement) {
971  partner_replacement = NULL;
972  break;
973  }
974  }
975  // Remove all references to this, and replace with replacement if not NULL.
976  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
977  TabVector* p_partner = p_it.data();
978  if (p_partner == this) {
979  p_it.extract();
980  if (partner_replacement != NULL)
981  p_it.add_before_stay_put(partner_replacement);
982  }
983  }
984  if (partner_replacement != NULL) {
985  partner_replacement->AddPartner(partner);
986  }
987  }
988  delete this;
989 }
990 
991 
992 } // namespace tesseract.