Lucene++ - a full-featured, C++ search engine
API Documentation


 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
UTF8Stream.h
Go to the documentation of this file.
1 
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef UTF8STREAM_H
8 #define UTF8STREAM_H
9 
10 #include "LuceneObject.h"
11 
12 namespace Lucene {
13 
// Abstract base class (readNext() is pure virtual) derived from LuceneObject.
// It declares the surrogate / code-point constants and the small helper
// predicates used when converting between Unicode encodings; subclasses
// supply the stream of input units by overriding readNext().
// NOTE(review): listing line 17 is absent from this extract, so the class may
// contain a declaration that is not shown here.
14 class UTF8Base : public LuceneObject {
15 public:
16  virtual ~UTF8Base();
18 
19 public:
    // Constants named after the UTF-16 surrogate ranges and related offsets.
    // Only the declarations are visible here; the values are defined elsewhere.
20  static const uint16_t LEAD_SURROGATE_MIN;
21  static const uint16_t LEAD_SURROGATE_MAX;
22  static const uint16_t TRAIL_SURROGATE_MIN;
23  static const uint16_t TRAIL_SURROGATE_MAX;
24  static const uint16_t LEAD_OFFSET;
25  static const uint32_t SURROGATE_OFFSET;
26  static const uint32_t CODE_POINT_MAX;
27 
    // Special wide-character values; presumably the substitute for invalid
    // input and the end-of-input marker -- confirm against the implementation.
28  static const wchar_t UNICODE_REPLACEMENT_CHAR;
29  static const wchar_t UNICODE_TERMINATOR;
30 
31 protected:
    // Supplies the next input unit; must be implemented by every subclass.
32  virtual uint32_t readNext() = 0;
33 
    // Narrowing helpers: take a 32-bit value and return an 8- or 16-bit one.
34  uint8_t mask8(uint32_t b);
35  uint16_t mask16(uint32_t c);
    // Classification predicates on a byte (b) or code point (cp). The exact
    // ranges tested live in the implementation file, not in this header.
36  bool isTrail(uint32_t b);
37  bool isSurrogate(uint32_t cp);
38  bool isLeadSurrogate(uint32_t cp);
39  bool isTrailSurrogate(uint32_t cp);
40  bool isValidCodePoint(uint32_t cp);
    // `length` is presumably the byte length of the sequence that produced
    // `cp` -- TODO confirm.
41  bool isOverlongSequence(uint32_t cp, int32_t length);
42 };
43 
// Encoder that reads wide characters from the pointer pair passed to the
// constructor and writes UTF-8 bytes into a caller-supplied buffer.
// NOTE(review): listing line 49 is absent from this extract, so the class may
// contain a declaration that is not shown here.
44 class UTF8Encoder : public UTF8Base {
45 public:
    // Stores the input range; presumably half-open [unicodeBegin, unicodeEnd)
    // -- confirm against the implementation.
46  UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd);
47  virtual ~UTF8Encoder();
48 
50 
51 protected:
    // Input range of wide characters to encode.
52  const wchar_t* unicodeBegin;
53  const wchar_t* unicodeEnd;
54 
55 public:
    // Encodes into `utf8`; `length` is presumably the capacity of that buffer
    // and the return value the number of bytes produced -- TODO confirm.
56  int32_t encode(uint8_t* utf8, int32_t length);
57 
    // Variants named for the width of the wide-character input (UTF-16 vs
    // UTF-32); same parameter and return types as encode().
58  int32_t utf16to8(uint8_t* utf8, int32_t length);
59  int32_t utf32to8(uint8_t* utf8, int32_t length);
60 
61 protected:
    // Implements the base class's pure virtual input hook.
62  virtual uint32_t readNext();
63 
    // Appends code point `cp` at `utf8` and returns a uint8_t*; presumably the
    // position just past the bytes written -- TODO confirm.
64  uint8_t* appendChar(uint8_t* utf8, uint32_t cp);
65 };
66 
// NOTE(review): this is a fragment. Listing lines 67, 69, 72 and 75 are absent
// from this extract, so the class header, the constructor and any data member
// are not shown. The destructor name below indicates the class is
// UTF8EncoderStream; its base class cannot be determined from here.
68 public:
70  virtual ~UTF8EncoderStream();
71 
73 
74 protected:
76 
77 protected:
    // Overrides the input hook; the source it reads from is not visible here.
78  virtual uint32_t readNext();
79 };
80 
// Decoder that reads UTF-8 bytes from the pointer pair passed to the
// constructor and writes wide characters into a caller-supplied buffer.
// NOTE(review): listing line 86 is absent from this extract, so the class may
// contain a declaration that is not shown here.
81 class UTF8Decoder : public UTF8Base {
82 public:
    // Stores the input range; presumably half-open [utf8Begin, utf8End)
    // -- confirm against the implementation.
83  UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End);
84  virtual ~UTF8Decoder();
85 
87 
88 protected:
    // Input range of UTF-8 bytes to decode.
89  const uint8_t* utf8Begin;
90  const uint8_t* utf8End;
91 
92 public:
    // Decodes into `unicode`; `length` is presumably the capacity of that
    // buffer and the return value the number of units produced -- TODO confirm.
93  int32_t decode(wchar_t* unicode, int32_t length);
94 
    // Variants named for the width of the wide-character output (UTF-16 vs
    // UTF-32); same parameter and return types as decode().
95  int32_t utf8to16(wchar_t* unicode, int32_t length);
96  int32_t utf8to32(wchar_t* unicode, int32_t length);
97 
98 protected:
    // Implements the base class's pure virtual input hook.
99  virtual uint32_t readNext();
100 
    // Helpers for multi-byte sequences. `cp` is taken by non-const reference
    // in getSequence() and isValidNext(), so both may modify it.
101  int32_t sequenceLength(uint32_t cp);
102  bool getSequence(uint32_t& cp, int32_t length);
103  bool isValidNext(uint32_t& cp);
104 };
105 
// NOTE(review): this is a fragment. Listing lines 106, 108, 111 and 114 are
// absent from this extract, so the class header, the constructor and any data
// member are not shown. The destructor name below indicates the class is
// UTF8DecoderStream; its base class cannot be determined from here.
107 public:
109  virtual ~UTF8DecoderStream();
110 
112 
113 protected:
115 
116 protected:
    // Overrides the input hook; the source it reads from is not visible here.
117  virtual uint32_t readNext();
118 };
119 
// Decoder that reads 16-bit units from the pointer pair passed to the
// constructor and writes wide characters into a caller-supplied buffer.
// NOTE(review): listing line 125 is absent from this extract, so the class may
// contain a declaration that is not shown here.
120 class UTF16Decoder : public UTF8Base {
121 public:
    // Stores the input range; presumably half-open [utf16Begin, utf16End)
    // -- confirm against the implementation.
122  UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End);
123  virtual ~UTF16Decoder();
124 
126 
127 protected:
    // Input range of 16-bit units to decode.
128  const uint16_t* utf16Begin;
129  const uint16_t* utf16End;
130 
131 public:
    // Decodes into `unicode`; `length` is presumably the capacity of that
    // buffer and the return value the number of units produced -- TODO confirm.
132  int32_t decode(wchar_t* unicode, int32_t length);
133 
    // Variants named for the width of the wide-character output (UTF-16 vs
    // UTF-32); same parameter and return types as decode().
134  int32_t utf16to16(wchar_t* unicode, int32_t length);
135  int32_t utf16to32(wchar_t* unicode, int32_t length);
136 
137 protected:
    // Implements the base class's pure virtual input hook.
138  virtual uint32_t readNext();
139 };
140 
141 }
142 
143 #endif

clucene.sourceforge.net