// Special label code recognised by the box-string parser in this file: the
// label text read from a box line is compared against it with strcmp(), and
// on a match the remainder of the line is searched for a '#' character.
// NOTE(review): presumably this marks a box covering a whole word/line whose
// actual text follows the '#' -- confirm against the box file format docs.
30 static const char* kMultiBlobLabelCode =
"WordStr";
35 const char *lastdot = strrchr(filename.
string(),
'.');
37 filename[lastdot - filename.
string()] =
'\0';
40 FILE* box_file =
NULL;
41 if (!(box_file = fopen(filename.
string(),
"rb"))) {
43 "Cant open box file %s",
61 return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
67 bool ReadNextBox(
int target_page,
int *line_number, FILE* box_file,
73 while (fgets(buff,
sizeof(buff) - 1, box_file)) {
77 const unsigned char *ubuf =
reinterpret_cast<const unsigned char*
>(buffptr);
78 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
81 while (*buffptr ==
' ' || *buffptr ==
'\t')
83 if (*buffptr !=
'\0') {
85 tprintf(
"Box file format error on line %i; ignored\n", *line_number);
88 if (target_page >= 0 && target_page != page)
107 *bounding_box =
TBOX();
110 const char *buffptr = boxfile_str;
116 while (*buffptr !=
'\0' && *buffptr !=
' ' && *buffptr !=
'\t' &&
118 uch[uch_len++] = *buffptr++;
121 if (*buffptr !=
'\0') ++buffptr;
122 int x_min, y_min, x_max, y_max;
124 int count = sscanf(buffptr,
"%d %d %d %d %d",
125 &x_min, &y_min, &x_max, &y_max, page_number);
126 if (count != 5 && count != 4) {
127 tprintf(
"Bad box coordinates in boxfile string!\n");
131 if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
132 (buffptr = strchr(buffptr,
'#')) !=
NULL) {
135 uch_len = strlen(uch);
139 while (used < uch_len) {
140 UNICHAR ch(uch + used, uch_len - used);
143 tprintf(
"Bad UTF-8 str %s starts with 0x%02x at col %d\n",
144 uch + used, uch[used], used + 1);
157 *box_str = unichar_str;