Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
params_training_featdef.h
Go to the documentation of this file.
1
2
// File: params_training_featdef.h
3
// Description: Feature definitions for params training.
4
// Author: Rika Antonova
5
// Created: Mon Nov 28 11:26:42 PDT 2011
6
//
7
// (C) Copyright 2011, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
19
20
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
21
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
22
23
#include "
genericvector.h
"
24
#include "
strngs.h
"
25
26
namespace
tesseract
{
27
28
// Raw features extracted from a single OCR hypothesis.
// The features are non-normalized real-valued quantities with
// unbounded range and unknown distribution.
// Normalization / binarization of these features is done at a later stage.
// Note: when adding new fields to this enum make sure to modify the
// kParamsTrainingRawFeatureTypeName array accordingly, so that every
// enumerator keeps a name at the matching index.
enum ParamsTrainingRawFeatureType {
  // What dictionary (if any) was this hypothesis found in.
  // See PermuterType enum in ccstruct/ratngs.h for interpretation.
  PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE,     // 0
  // Boolean indicator of whether this hypothesis is ambiguous to a known
  // dictionary word (or a valid number pattern).
  PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH,  // 1
  // Shape cost of the segmentation path for this hypothesis.
  PTRAIN_RAW_FEATURE_SHAPE_COST,          // 2
  // Character ngram probability of the string of unichars of this hypothesis.
  PTRAIN_RAW_FEATURE_NGRAM_PROB,          // 3
  // Number of bad/inconsistent spots in this hypothesis.
  PTRAIN_RAW_FEATURE_NUM_BAD_PUNC,        // 4
  PTRAIN_RAW_FEATURE_NUM_BAD_CASE,        // 5
  PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE,   // 6
  PTRAIN_RAW_FEATURE_NUM_BAD_SPACING,     // 7
  PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT,      // 8
  PTRAIN_RAW_FEATURE_NUM_BAD_FONT,        // 9
  // Classifier-related features.
  PTRAIN_RAW_FEATURE_WORST_CERT,          // 10
  PTRAIN_RAW_FEATURE_RATING,              // 11
  // Number of classifier results that came from adapted templates.
  PTRAIN_RAW_FEATURE_ADAPTED,             // 12
  // Features potentially useful for normalization.
  PTRAIN_RAW_FEATURE_NUM_UNICHARS,        // 13
  PTRAIN_RAW_FEATURE_OUTLINE_LEN,         // 14

  // Sentinel: number of raw feature types. Must stay last so that it equals
  // the count of the enumerators above.
  PTRAIN_NUM_RAW_FEATURE_TYPES
};
63
64
// Human-readable names of the raw feature types.
// The entry at index i names the ParamsTrainingRawFeatureType enumerator
// whose value is i (see the trailing index comments), so the order and the
// number of entries must be kept in sync with that enum.
static const char * const kParamsTrainingRawFeatureTypeName[] = {
  "DICT_MATCH_TYPE",     // 0
  "UNAMBIG_DICT_MATCH",  // 1
  "SHAPE_COST",          // 2
  "NGRAM_PROB",          // 3
  "NUM_BAD_PUNC",        // 4
  "NUM_BAD_CASE",        // 5
  "NUM_BAD_CHAR_TYPE",   // 6
  "NUM_BAD_SPACING",     // 7
  "NUM_BAD_SCRIPT",      // 8
  "NUM_BAD_FONT",        // 9
  "WORST_CERT",          // 10
  "RATING",              // 11
  "ADAPTED",             // 12
  "NUM_UNICHARS",        // 13
  "OUTLINE_LEN",         // 14
};
81
82
// Entry with features extracted from a single OCR hypothesis for a word.
83
struct
ParamsTrainingHypothesis
{
84
ParamsTrainingHypothesis
() {
85
for
(
int
i = 0; i <
PTRAIN_NUM_RAW_FEATURE_TYPES
; ++i)
features
[i] = 0.0;
86
}
87
float
features
[
PTRAIN_NUM_RAW_FEATURE_TYPES
];
88
STRING
str
;
// string corresponding to word hypothesis (for debugging)
89
};
90
91
// A list of hypotheses explored during one run of segmentation search.
92
typedef GenericVector<ParamsTrainingHypothesis> ParamsTrainingHypothesisList;  // elements are ParamsTrainingHypothesis entries
93
94
// A bundle that accumulates all of the hypothesis lists explored during all
95
// of the runs of segmentation search on a word (e.g. a list of hypotheses
96
// explored on PASS1, PASS2, fix xheight pass, etc).
97
class
ParamsTrainingBundle
{
98
public
:
99
ParamsTrainingBundle
() {};
100
// Starts a new hypothesis list.
101
// Should be called at the beginning of a new run of the segmentation search.
102
void
StartHypothesisList
() {
103
hyp_list_vec
.
push_back
(
ParamsTrainingHypothesisList
());
104
}
105
// Adds a new ParamsTrainingHypothesis to the current hypothesis list
106
// and returns the reference to the newly added entry.
107
ParamsTrainingHypothesis
&
AddHypothesis
() {
108
if
(
hyp_list_vec
.
empty
())
StartHypothesisList
();
109
hyp_list_vec
.
back
().push_back(
ParamsTrainingHypothesis
());
110
return
hyp_list_vec
.
back
().back();
111
}
112
113
GenericVector<ParamsTrainingHypothesisList>
hyp_list_vec
;
114
};
115
116
}
// namespace tesseract
117
118
#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
mnt
data
src
tesseract-ocr
ccstruct
params_training_featdef.h
Generated on Thu Nov 1 2012 20:19:44 for Tesseract by
1.8.1