Lucene++ - a full-featured C++ search engine
API Documentation


 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
UTF8Stream.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2011 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef UTF8STREAM_H
8 #define UTF8STREAM_H
9 
10 #include "LuceneObject.h"
11 
12 namespace Lucene
13 {
// UTF8Base: abstract base class in namespace Lucene, publicly derived from
// LuceneObject. It declares a set of static constants, a pure virtual
// readNext() hook for subclasses, and protected helper functions. Only
// declarations are visible in this header; every definition lives elsewhere.
// NOTE(review): listing line 18 is absent from this extract (numbering jumps
// 17 -> 19), so a declaration may be missing here - confirm against the real header.
14  class UTF8Base : public LuceneObject
15  {
16  public:
// Virtual destructor, declared here without a body.
17  virtual ~UTF8Base();
19 
20  public:
// Static constants declared without initializers; their values are not shown
// in this header. Going by the names, they presumably bound the UTF-16
// lead/trail surrogate ranges and hold the offsets used for surrogate-pair
// arithmetic, plus the largest valid code point - TODO confirm the values.
21  static const uint16_t LEAD_SURROGATE_MIN;
22  static const uint16_t LEAD_SURROGATE_MAX;
23  static const uint16_t TRAIL_SURROGATE_MIN;
24  static const uint16_t TRAIL_SURROGATE_MAX;
25  static const uint16_t LEAD_OFFSET;
26  static const uint32_t SURROGATE_OFFSET;
27  static const uint32_t CODE_POINT_MAX;
28 
// wchar_t constants, also defined elsewhere. Presumably the substitution
// character for invalid input and an end-of-input sentinel - verify usage.
29  static const wchar_t UNICODE_REPLACEMENT_CHAR;
30  static const wchar_t UNICODE_TERMINATOR;
31 
32  protected:
// Pure virtual: each concrete subclass supplies its own way of producing the
// next 32-bit unit. What is returned at end of input is not visible here.
33  virtual uint32_t readNext() = 0;
34 
// Non-virtual, non-static helpers taking a 32-bit value. From the return
// types, mask8/mask16 narrow a value to 8/16 bits; the is* functions are
// boolean predicates. Exact rules are in the implementation, not shown here.
35  uint8_t mask8(uint32_t b);
36  uint16_t mask16(uint32_t c);
37  bool isTrail(uint32_t b);
38  bool isSurrogate(uint32_t cp);
39  bool isLeadSurrogate(uint32_t cp);
40  bool isTrailSurrogate(uint32_t cp);
41  bool isValidCodePoint(uint32_t cp);
// Takes a code point and a sequence length; presumably flags encodings that
// are longer than necessary for cp - TODO confirm.
42  bool isOverlongSequence(uint32_t cp, int32_t length);
43  };
44 
// UTF8Encoder: concrete subclass of UTF8Base constructed over a
// [unicodeBegin, unicodeEnd) pair of const wchar_t pointers and exposing
// three functions that fill a caller-supplied uint8_t buffer.
// NOTE(review): listing line 51 is absent from this extract (numbering jumps
// 50 -> 52), so a declaration may be missing - confirm against the real header.
45  class UTF8Encoder : public UTF8Base
46  {
47  public:
// Takes two const wchar_t pointers; whether unicodeEnd is one-past-the-end
// (half-open range) is the usual convention but is not shown here - confirm.
48  UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd);
49  virtual ~UTF8Encoder();
50 
52 
53  protected:
// Input range pointers. They are raw const pointers, so this class does not
// express ownership of the underlying buffer in its type.
54  const wchar_t* unicodeBegin;
55  const wchar_t* unicodeEnd;
56 
57  public:
// All three share the signature (uint8_t* utf8, int32_t length) -> int32_t.
// Presumably utf8 is the output buffer, length its capacity in bytes and the
// return value the number of bytes produced - TODO confirm, including what
// is returned on exhausted input or a too-small buffer.
58  int32_t encode(uint8_t* utf8, int32_t length);
59 
// NOTE(review): names suggest encode() selects one of these depending on
// whether wchar_t holds UTF-16 or UTF-32 units - unverified from this header.
60  int32_t utf16to8(uint8_t* utf8, int32_t length);
61  int32_t utf32to8(uint8_t* utf8, int32_t length);
62 
63  protected:
// Implements the pure virtual readNext() declared in UTF8Base.
64  virtual uint32_t readNext();
65 
// Takes an output pointer and a code point and returns a uint8_t*;
// presumably the write position advanced past the appended bytes - confirm.
66  uint8_t* appendChar(uint8_t* utf8, uint32_t cp);
67  };
68 
// Body of class UTF8EncoderStream (name taken from the destructor below).
// NOTE(review): this extract is incomplete - listing lines 69, 72, 75 and 78
// are absent, so the class head (and its base class), any constructor and any
// data member are not visible here. Restore them from the real header before
// relying on this listing.
70  {
71  public:
73  virtual ~UTF8EncoderStream();
74 
76 
// This protected section is empty in the extract (listing line 78 is missing).
77  protected:
79 
80  protected:
// Virtual readNext() with the same signature as the hook declared in
// UTF8Base; presumably an override supplying units from a stream - the
// source of the data is not visible in this extract.
81  virtual uint32_t readNext();
82  };
83 
// UTF8Decoder: concrete subclass of UTF8Base constructed over a
// [utf8Begin, utf8End) pair of const uint8_t pointers and exposing three
// functions that fill a caller-supplied wchar_t buffer.
// NOTE(review): listing line 90 is absent from this extract (numbering jumps
// 89 -> 91), so a declaration may be missing - confirm against the real header.
84  class UTF8Decoder : public UTF8Base
85  {
86  public:
// Takes two const uint8_t pointers; utf8End is presumably one-past-the-end
// of the input bytes - confirm against the implementation.
87  UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End);
88  virtual ~UTF8Decoder();
89 
91 
92  protected:
// Input range pointers, held as raw const pointers (no ownership expressed).
93  const uint8_t* utf8Begin;
94  const uint8_t* utf8End;
95 
96  public:
// All three share the signature (wchar_t* unicode, int32_t length) -> int32_t.
// Presumably unicode is the output buffer, length its capacity in wchar_t
// units and the return value the number of units written - TODO confirm,
// including the value returned at end of input or on malformed bytes.
97  int32_t decode(wchar_t* unicode, int32_t length);
98 
// NOTE(review): names suggest decode() selects one of these depending on the
// width of wchar_t - unverified from this header.
99  int32_t utf8to16(wchar_t* unicode, int32_t length);
100  int32_t utf8to32(wchar_t* unicode, int32_t length);
101 
102  protected:
// Implements the pure virtual readNext() declared in UTF8Base.
103  virtual uint32_t readNext();
104 
// Presumably maps a leading byte value to the byte length of its sequence -
// confirm what is returned for an invalid lead byte.
105  int32_t sequenceLength(uint32_t cp);
// Both take cp by non-const reference, so they are able to write a value
// back to the caller; the bool presumably signals success/validity - confirm.
106  bool getSequence(uint32_t& cp, int32_t length);
107  bool isValidNext(uint32_t& cp);
108  };
109 
// Body of class UTF8DecoderStream (name taken from the destructor below).
// NOTE(review): this extract is incomplete - listing lines 110, 113, 116 and
// 119 are absent, so the class head (and its base class), any constructor and
// any data member are not visible here. Restore them from the real header
// before relying on this listing.
111  {
112  public:
114  virtual ~UTF8DecoderStream();
115 
117 
// This protected section is empty in the extract (listing line 119 is missing).
118  protected:
120 
121  protected:
// Virtual readNext() with the same signature as the hook declared in
// UTF8Base; presumably an override supplying bytes from a stream - the
// source of the data is not visible in this extract.
122  virtual uint32_t readNext();
123  };
124 
// UTF16Decoder: concrete subclass of UTF8Base (directly, not of UTF8Decoder)
// constructed over a [utf16Begin, utf16End) pair of const uint16_t pointers
// and exposing three functions that fill a caller-supplied wchar_t buffer.
// NOTE(review): listing line 131 is absent from this extract (numbering jumps
// 130 -> 132), so a declaration may be missing - confirm against the real header.
125  class UTF16Decoder : public UTF8Base
126  {
127  public:
// Takes two const uint16_t pointers; utf16End is presumably one-past-the-end
// of the input units - confirm against the implementation.
128  UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End);
129  virtual ~UTF16Decoder();
130 
132 
133  protected:
// Input range pointers, held as raw const pointers (no ownership expressed).
134  const uint16_t* utf16Begin;
135  const uint16_t* utf16End;
136 
137  public:
// All three share the signature (wchar_t* unicode, int32_t length) -> int32_t.
// Presumably unicode is the output buffer, length its capacity in wchar_t
// units and the return value the number of units written - TODO confirm.
138  int32_t decode(wchar_t* unicode, int32_t length);
139 
// NOTE(review): names suggest decode() selects one of these depending on the
// width of wchar_t (copy for 16-bit, surrogate combining for 32-bit) - unverified.
140  int32_t utf16to16(wchar_t* unicode, int32_t length);
141  int32_t utf16to32(wchar_t* unicode, int32_t length);
142 
143  protected:
// Implements the pure virtual readNext() declared in UTF8Base.
144  virtual uint32_t readNext();
145  };
146 }
147 
148 #endif

clucene.sourceforge.net