Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tessdatamanager.h
Go to the documentation of this file.
1 
2 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
22 
23 #include <stdio.h>
24 #include "host.h"
25 #include "tprintf.h"
26 
// Suffix string for trained-data files.
static const char kTrainedDataSuffix[] = "traineddata";

// File-name suffixes of the individual tessdata components.
// When adding new tessdata types and file suffixes, please make sure to
// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
static const char kLangConfigFileSuffix[] = "config";
static const char kUnicharsetFileSuffix[] = "unicharset";
static const char kAmbigsFileSuffix[] = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static const char kNormProtoFileSuffix[] = "normproto";
static const char kPuncDawgFileSuffix[] = "punc-dawg";
static const char kSystemDawgFileSuffix[] = "word-dawg";
static const char kNumberDawgFileSuffix[] = "number-dawg";
static const char kFreqDawgFileSuffix[] = "freq-dawg";
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
static const char kShapeTableFileSuffix[] = "shapetable";
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
48 
49 namespace tesseract {
50 
69 
71 };
72 
77 static const char * const kTessdataFileSuffixes[] = {
78  kLangConfigFileSuffix, // 0
79  kUnicharsetFileSuffix, // 1
80  kAmbigsFileSuffix, // 2
81  kBuiltInTemplatesFileSuffix, // 3
82  kBuiltInCutoffsFileSuffix, // 4
83  kNormProtoFileSuffix, // 5
84  kPuncDawgFileSuffix, // 6
85  kSystemDawgFileSuffix, // 7
86  kNumberDawgFileSuffix, // 8
87  kFreqDawgFileSuffix, // 9
88  kFixedLengthDawgsFileSuffix, // 10
89  kCubeUnicharsetFileSuffix, // 11
90  kCubeSystemDawgFileSuffix, // 12
91  kShapeTableFileSuffix, // 13
92  kBigramDawgFileSuffix, // 14
93  kUnambigDawgFileSuffix, // 15
94  kParamsTrainingModelFileSuffix, // 16
95 };
96 
// For each tessdata component index, true if that component is a text file.
// Entries line up one-to-one with kTessdataFileSuffixes; the suffix found at
// the same index of that table is noted next to each value.
static const bool kTessdataFileIsText[] = {
  true,   // 0:  config
  true,   // 1:  unicharset
  true,   // 2:  unicharambigs
  false,  // 3:  inttemp
  true,   // 4:  pffmtable
  true,   // 5:  normproto
  false,  // 6:  punc-dawg
  false,  // 7:  word-dawg
  false,  // 8:  number-dawg
  false,  // 9:  freq-dawg
  false,  // 10: fixed-length-dawgs
  true,   // 11: cube-unicharset
  false,  // 12: cube-word-dawg
  false,  // 13: shapetable
  false,  // 14: bigram-dawg
  false,  // 15: unambig-dawg
  false,  // 16: params-training-model
};
120 
// Upper limit on the number of tessdata entries.
// NOTE(review): nothing in this header uses the constant; presumably the
// implementation file applies it as a sanity bound when reading the entry
// count from a data file - confirm there.
static const int kMaxNumTessdataEntries = 1000;
129 
130 
132  public:
134  data_file_ = NULL;
135  actual_tessdata_num_entries_ = 0;
136  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
137  offset_table_[i] = -1;
138  }
139  }
141  int DebugLevel() { return debug_level_; }
142 
147  bool Init(const char *data_file_name, int debug_level);
148 
150  inline FILE *GetDataFilePtr() const { return data_file_; }
151 
157  inline bool SeekToStart(TessdataType tessdata_type) {
158  if (debug_level_) {
159  tprintf("TessdataManager: seek to offset %lld - start of tessdata"
160  "type %d (%s))\n", offset_table_[tessdata_type],
161  tessdata_type, kTessdataFileSuffixes[tessdata_type]);
162  }
163  if (offset_table_[tessdata_type] < 0) {
164  return false;
165  } else {
166  ASSERT_HOST(fseek(data_file_,
167  static_cast<size_t>(offset_table_[tessdata_type]),
168  SEEK_SET) == 0);
169  return true;
170  }
171  }
173  inline inT64 GetEndOffset(TessdataType tessdata_type) const {
174  int index = tessdata_type + 1;
175  while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
176  ++index; // skip tessdata types not present in the combined file
177  }
178  if (debug_level_) {
179  tprintf("TessdataManager: end offset for type %d is %lld\n",
180  tessdata_type,
181  (index == actual_tessdata_num_entries_) ? -1
182  : offset_table_[index]);
183  }
184  return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
185  }
187  inline void End() {
188  if (data_file_ != NULL) {
189  fclose(data_file_);
190  data_file_ = NULL;
191  }
192  }
193  bool swap() const {
194  return swap_;
195  }
196 
198  static void WriteMetadata(inT64 *offset_table, FILE *output_file);
199 
205  static bool CombineDataFiles(const char *language_data_path_prefix,
206  const char *output_filename);
207 
213  bool OverwriteComponents(const char *new_traineddata_filename,
214  char **component_filenames,
215  int num_new_components);
216 
227  bool ExtractToFile(const char *filename);
228 
234  static void CopyFile(FILE *input_file, FILE *output_file,
235  bool newline_end, inT64 num_bytes_to_copy);
236 
245  static bool TessdataTypeFromFileSuffix(const char *suffix,
246  TessdataType *type,
247  bool *text_file);
248 
253  static bool TessdataTypeFromFileName(const char *filename,
254  TessdataType *type,
255  bool *text_file);
256 
257  private:
258 
263  static FILE *GetFilePtr(const char *language_data_path_prefix,
264  const char *file_suffix, bool text_file);
265 
270  inT64 offset_table_[TESSDATA_NUM_ENTRIES];
279  inT32 actual_tessdata_num_entries_;
280  FILE *data_file_;
281  int debug_level_;
282  // True if the bytes need swapping.
283  bool swap_;
284 };
285 
286 
287 } // namespace tesseract
288 
289 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_