Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
trainingsampleset.h
Go to the documentation of this file.
1
// Copyright 2010 Google Inc. All Rights Reserved.
2
// Author: rays@google.com (Ray Smith)
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
// http://www.apache.org/licenses/LICENSE-2.0
8
// Unless required by applicable law or agreed to in writing, software
9
// distributed under the License is distributed on an "AS IS" BASIS,
10
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
// See the License for the specific language governing permissions and
12
// limitations under the License.
13
//
15
16
#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
17
#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
18
19
#include "
bitvector.h
"
20
#include "
genericvector.h
"
21
#include "
indexmapbidi.h
"
22
#include "
matrix.h
"
23
#include "
shapetable.h
"
24
#include "
trainingsample.h
"
25
26
class
UNICHARSET
;
27
template
<
typename
T>
class
UnicityTable
;
28
29
namespace
tesseract
{
30
31
struct
FontInfo;
32
class
IntFeatureMap;
33
class
IntFeatureSpace;
34
class
TrainingSample;
35
class
UnicharAndFonts;
36
37
// Collection of TrainingSample used for training or testing a classifier.
38
// Provides several useful methods to operate on the collection as a whole,
39
// including outlier detection and deletion, providing access by font and
40
// class, finding the canonical sample, finding the "cloud" features (OR of
41
// all features in all samples), replication of samples, caching of distance
42
// metrics.
43
class
TrainingSampleSet
{
44
public
:
45
explicit
TrainingSampleSet
(
const
UnicityTable<FontInfo>
& fontinfo_table);
46
~TrainingSampleSet
();
47
48
// Writes to the given file. Returns false in case of error.
49
bool
Serialize
(FILE* fp)
const
;
50
// Reads from the given file. Returns false in case of error.
51
// If swap is true, assumes a big/little-endian swap is needed.
52
bool
DeSerialize
(
bool
swap, FILE* fp);
53
54
// Accessors
55
int
num_samples
()
const
{
56
return
samples_.
size
();
57
}
58
int
num_raw_samples
()
const
{
59
return
num_raw_samples_;
60
}
61
int
NumFonts
()
const
{
62
return
font_id_map_.
SparseSize
();
63
}
64
const
UNICHARSET
&
unicharset
()
const
{
65
return
unicharset_;
66
}
67
int
charsetsize
()
const
{
68
return
unicharset_size_;
69
}
70
71
// Loads an initial unicharset, or sets one up if the file cannot be read.
72
void
LoadUnicharset
(
const
char
*
filename
);
73
74
// Adds a character sample to this sample set.
75
// If the unichar is not already in the local unicharset, it is added.
76
// Returns the unichar_id of the added sample, from the local unicharset.
77
int
AddSample
(
const
char
* unichar,
TrainingSample
*
sample
);
78
// Adds a character sample to this sample set with the given unichar_id,
79
// which must correspond to the local unicharset (in this).
80
void
AddSample
(
int
unichar_id,
TrainingSample
*
sample
);
81
82
// Returns the number of samples for the given font,class pair.
83
// If randomize is true, returns the number of samples accessible
84
// with randomizing on. (Increases the number of samples if small.)
85
// OrganizeByFontAndClass must have been already called.
86
int
NumClassSamples
(
int
font_id,
int
class_id,
bool
randomize)
const
;
87
88
// Gets a sample by its index.
89
const
TrainingSample
*
GetSample
(
int
index)
const
;
90
91
// Gets a sample by its font, class, index.
92
// OrganizeByFontAndClass must have been already called.
93
const
TrainingSample
*
GetSample
(
int
font_id,
int
class_id,
int
index)
const
;
94
95
// Get a sample by its font, class, index. Does not randomize.
96
// OrganizeByFontAndClass must have been already called.
97
TrainingSample
*
MutableSample
(
int
font_id,
int
class_id,
int
index);
98
99
// Returns a string debug representation of the given sample:
100
// font, unichar_str, bounding box, page.
101
STRING
SampleToString
(
const
TrainingSample
&
sample
)
const
;
102
103
// Gets the combined set of features used by all the samples of the given
104
// font/class combination.
105
const
BitVector
&
GetCloudFeatures
(
int
font_id,
int
class_id)
const
;
106
// Gets the indexed features of the canonical sample of the given
107
// font/class combination.
108
const
GenericVector<int>
&
GetCanonicalFeatures
(
int
font_id,
109
int
class_id)
const
;
110
111
// Returns the distance between the given UniCharAndFonts pair.
112
// If matched_fonts, only matching fonts, are considered, unless that yields
113
// the empty set.
114
// OrganizeByFontAndClass must have been already called.
115
float
UnicharDistance
(
const
UnicharAndFonts
& uf1,
const
UnicharAndFonts
& uf2,
116
bool
matched_fonts,
const
IntFeatureMap
& feature_map);
117
118
// Returns the distance between the given pair of font/class pairs.
119
// Finds in cache or computes and caches.
120
// OrganizeByFontAndClass must have been already called.
121
float
ClusterDistance
(
int
font_id1,
int
class_id1,
122
int
font_id2,
int
class_id2,
123
const
IntFeatureMap
& feature_map);
124
125
// Computes the distance between the given pair of font/class pairs.
126
float
ComputeClusterDistance
(
int
font_id1,
int
class_id1,
127
int
font_id2,
int
class_id2,
128
const
IntFeatureMap
& feature_map)
const
;
129
130
// Returns the number of canonical features of font/class 2 for which
131
// neither the feature nor any of its near neighbors occurs in the cloud
132
// of font/class 1. Each such feature is a reliable separation between
133
// the classes, ASSUMING that the canonical sample is sufficiently
134
// representative that every sample has a feature near that particular
135
// feature. To check that this is so on the fly would be prohibitively
136
// expensive, but it might be possible to pre-qualify the canonical features
137
// to include only those for which this assumption is true.
138
// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
139
// first, or the results will be nonsense.
140
int
ReliablySeparable
(
int
font_id1,
int
class_id1,
141
int
font_id2,
int
class_id2,
142
const
IntFeatureMap
& feature_map,
143
bool
thorough)
const
;
144
145
146
// Returns the total index of the requested sample.
147
// OrganizeByFontAndClass must have been already called.
148
int
GlobalSampleIndex
(
int
font_id,
int
class_id,
int
index)
const
;
149
150
// Gets the canonical sample for the given font, class pair.
151
// ComputeCanonicalSamples must have been called first.
152
const
TrainingSample
*
GetCanonicalSample
(
int
font_id,
int
class_id)
const
;
153
// Gets the max distance for the given canonical sample.
154
// ComputeCanonicalSamples must have been called first.
155
float
GetCanonicalDist
(
int
font_id,
int
class_id)
const
;
156
157
// Returns a mutable pointer to the sample with the given index.
158
TrainingSample
*
mutable_sample
(
int
index) {
159
return
samples_[index];
160
}
161
// Gets ownership of the sample with the given index, removing it from this.
162
TrainingSample
*
extract_sample
(
int
index) {
163
TrainingSample
*
sample
= samples_[index];
164
samples_[index] =
NULL
;
165
return
sample;
166
}
167
168
// Generates indexed features for all samples with the supplied feature_space.
169
void
IndexFeatures
(
const
IntFeatureSpace
& feature_space);
170
171
// Delete outlier samples with few features that are shared with others.
172
// IndexFeatures must have been called already.
173
void
DeleteOutliers
(
const
IntFeatureSpace
& feature_space,
bool
debug);
174
175
// Marks the given sample for deletion.
176
// Deletion is actually completed by DeleteDeadSamples.
177
void
KillSample
(
TrainingSample
*
sample
);
178
179
// Deletes all samples with a negative sample index marked by KillSample.
180
// Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
181
// must be called after as the samples have been renumbered.
182
void
DeleteDeadSamples
();
183
184
// Callback function returns true if the given sample is to be deleted, due
185
// to having a negative classid.
186
bool
DeleteableSample
(
const
TrainingSample
*
sample
);
187
188
// Construct an array to access the samples by font,class pair.
189
void
OrganizeByFontAndClass
();
190
191
// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
192
// index for the font_class_array_.
193
void
SetupFontIdMap
();
194
195
// Finds the sample for each font, class pair that has least maximum
196
// distance to all the other samples of the same font, class.
197
// OrganizeByFontAndClass must have been already called.
198
void
ComputeCanonicalSamples
(
const
IntFeatureMap
& map,
bool
debug);
199
200
// Replicates the samples to a minimum frequency defined by
201
// 2 * kSampleRandomSize, or for larger counts duplicates all samples.
202
// After replication, the replicated samples are perturbed slightly, but
203
// in a predictable and repeatable way.
204
// Use after OrganizeByFontAndClass().
205
void
ReplicateAndRandomizeSamples
();
206
207
// Caches the indexed features of the canonical samples.
208
// ComputeCanonicalSamples must have been already called.
209
void
ComputeCanonicalFeatures
();
210
// Computes the combined set of features used by all the samples of each
211
// font/class combination. Use after ReplicateAndRandomizeSamples.
212
void
ComputeCloudFeatures
(
int
feature_space_size);
213
214
// Adds all fonts of the given class to the shape.
215
void
AddAllFontsForClass
(
int
class_id,
Shape
* shape)
const
;
216
217
// Display the samples with the given indexed feature that also match
218
// the given shape.
219
void
DisplaySamplesWithFeature
(
int
f_index,
const
Shape
& shape,
220
const
IntFeatureSpace
& feature_space,
221
ScrollView::Color
color,
222
ScrollView
* window)
const
;
223
224
private
:
225
// Struct to store a triplet of unichar, font, distance in the distance cache.
226
struct
FontClassDistance {
227
int
unichar_id;
228
int
font_id;
// Real font id.
229
float
distance;
230
};
231
// Simple struct to store information related to each font/class combination.
232
struct
FontClassInfo {
233
FontClassInfo();
234
235
// Writes to the given file. Returns false in case of error.
236
bool
Serialize
(FILE* fp)
const
;
237
// Reads from the given file. Returns false in case of error.
238
// If swap is true, assumes a big/little-endian swap is needed.
239
bool
DeSerialize
(
bool
swap, FILE* fp);
240
241
// Number of raw samples.
242
inT32
num_raw_samples
;
243
// Index of the canonical sample.
244
inT32
canonical_sample;
245
// Max distance of the canonical sample from any other.
246
float
canonical_dist;
247
// Sample indices for the samples, including replicated.
248
GenericVector<inT32>
samples;
249
250
// Non-serialized cache data.
251
// Indexed features of the canonical sample.
252
GenericVector<int>
canonical_features;
253
// The mapped features of all the samples.
254
BitVector cloud_features;
255
256
// Caches for ClusterDistance.
257
// Caches for other fonts but matching this unichar. -1 indicates not set.
258
// Indexed by compact font index from font_id_map_.
259
GenericVector<float>
font_distance_cache;
260
// Caches for other unichars but matching this font. -1 indicates not set.
261
GenericVector<float>
unichar_distance_cache;
262
// Cache for the rest (non matching font and unichar.)
263
// A cache of distances computed by ReliablySeparable.
264
GenericVector<FontClassDistance>
distance_cache;
265
};
266
267
PointerVector<TrainingSample> samples_;
268
// Number of samples before replication/randomization.
269
int
num_raw_samples_;
270
// Character set we are training for.
271
UNICHARSET
unicharset_;
272
// Character set size to which the 2-d arrays below refer.
273
int
unicharset_size_;
274
// Map to allow the font_class_array_ below to be compact.
275
// The sparse space is the real font_id, used in samples_ .
276
// The compact space is an index to font_class_array_
277
IndexMapBiDi font_id_map_;
278
// A 2-d array of FontClassInfo holding information related to each
279
// (font_id, class_id) pair.
280
GENERIC_2D_ARRAY<FontClassInfo>
* font_class_array_;
281
282
// Reference to the fontinfo_table_ in MasterTrainer. Provides names
283
// for font_ids in the samples. Not serialized!
284
const
UnicityTable<FontInfo>
& fontinfo_table_;
285
};
286
287
}
// namespace tesseract.
288
289
290
#endif  // TESSERACT_TRAINING_TRAININGSAMPLESET_H__
mnt
data
src
tesseract-ocr
classify
trainingsampleset.h
Generated on Thu Nov 1 2012 20:19:47 for Tesseract by
1.8.1