#include <normlzr.h>
Public Types | |
enum | { DONE = 0xffff } |
If DONE is returned from an iteration function that returns a code point, then there are no more normalization results available. More... | |
enum | { COMPAT_BIT = 1, DECOMP_BIT = 2, COMPOSE_BIT = 4, FCD_BIT = 8 } |
This tells us what the bits in the "mode" mean. More... | |
enum | EMode { NO_OP = 0, COMPOSE = COMPOSE_BIT, COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT, DECOMP = DECOMP_BIT, DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT, FCD = FCD_BIT } |
The mode of a Normalizer object. More... | |
enum | { IGNORE_HANGUL = 0x001 } |
The options for a Normalizer object. More... | |
Public Member Functions | |
Normalizer (const UnicodeString &str, UNormalizationMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of a given string. | |
Normalizer (const UChar *str, int32_t length, UNormalizationMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of a given string. | |
Normalizer (const CharacterIterator &iter, UNormalizationMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of the given text. | |
Normalizer (const Normalizer &copy) | |
Copy constructor. | |
~Normalizer () | |
Destructor. | |
UChar32 | current (void) |
Return the current character in the normalized text. | |
UChar32 | first (void) |
Return the first character in the normalized text. | |
UChar32 | last (void) |
Return the last character in the normalized text. | |
UChar32 | next (void) |
Return the next character in the normalized text. | |
UChar32 | previous (void) |
Return the previous character in the normalized text. | |
UChar32 | setIndex (int32_t index) |
Set the iteration position in the input text that is being normalized and return the first normalized character at that position. | |
void | setIndexOnly (int32_t index) |
Set the iteration position in the input text that is being normalized, without any immediate normalization. | |
void | reset (void) |
Reset the index to the beginning of the text. | |
int32_t | getIndex (void) const |
Retrieve the current iteration position in the input text that is being normalized. | |
int32_t | startIndex (void) const |
Retrieve the index of the start of the input text. | |
int32_t | endIndex (void) const |
Retrieve the index of the end of the input text. | |
UBool | operator== (const Normalizer &that) const |
Returns TRUE when both iterators refer to the same character in the same input text. | |
UBool | operator!= (const Normalizer &that) const |
Returns FALSE when both iterators refer to the same character in the same input text. | |
Normalizer * | clone (void) const |
Returns a pointer to a new Normalizer that is a clone of this one. | |
int32_t | hashCode (void) const |
Generates a hash code for this iterator. | |
void | setMode (UNormalizationMode newMode) |
Set the normalization mode for this object. | |
UNormalizationMode | getUMode (void) const |
Return the normalization mode for this object. | |
void | setOption (int32_t option, UBool value) |
Set options that affect this Normalizer's operation. | |
UBool | getOption (int32_t option) const |
Determine whether an option is turned on or off. | |
void | setText (const UnicodeString &newText, UErrorCode &status) |
Set the input text over which this Normalizer will iterate. | |
void | setText (const CharacterIterator &newText, UErrorCode &status) |
Set the input text over which this Normalizer will iterate. | |
void | setText (const UChar *newText, int32_t length, UErrorCode &status) |
Set the input text over which this Normalizer will iterate. | |
void | getText (UnicodeString &result) |
Copies the input text into the UnicodeString argument. | |
Normalizer (const UnicodeString &str, EMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of a given string. | |
Normalizer (const UnicodeString &str, EMode mode, int32_t opt) | |
Creates a new Normalizer object for iterating over the normalized form of a given string. | |
Normalizer (const UChar *str, int32_t length, EMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of a given UChar string. | |
Normalizer (const UChar *str, int32_t length, EMode mode, int32_t option) | |
Creates a new Normalizer object for iterating over the normalized form of a given UChar string. | |
Normalizer (const CharacterIterator &iter, EMode mode) | |
Creates a new Normalizer object for iterating over the normalized form of the given text. | |
Normalizer (const CharacterIterator &iter, EMode mode, int32_t opt) | |
Creates a new Normalizer object for iterating over the normalized form of the given text. | |
void | setMode (EMode newMode) |
Set the normalization mode for this object. | |
EMode | getMode (void) const |
Return the basic operation performed by this Normalizer. | |
Static Public Member Functions | |
static void | normalize (const UnicodeString &source, UNormalizationMode mode, int32_t options, UnicodeString &result, UErrorCode &status) |
Normalizes a UnicodeString according to the specified normalization mode. | |
static void | compose (const UnicodeString &source, UBool compat, int32_t options, UnicodeString &result, UErrorCode &status) |
Compose a UnicodeString. | |
static void | decompose (const UnicodeString &source, UBool compat, int32_t options, UnicodeString &result, UErrorCode &status) |
Static method to decompose a UnicodeString. | |
static UNormalizationCheckResult | quickCheck (const UnicodeString &source, UNormalizationMode mode, UErrorCode &status) |
Performing quick check on a string, to quickly determine if the string is in a particular normalization format. | |
static UnicodeString & | concatenate (UnicodeString &left, UnicodeString &right, UnicodeString &result, UNormalizationMode mode, int32_t options, UErrorCode &errorCode) |
static void | normalize (const UnicodeString &source, EMode mode, int32_t options, UnicodeString &result, UErrorCode &status) |
Normalizes a UnicodeString using the given normalization operation. | |
static UNormalizationCheckResult | quickCheck (const UnicodeString &source, EMode mode, UErrorCode &status) |
Performing quick check on a string, to quickly determine if the string is in a particular normalization format. | |
static UNormalizationMode | getUNormalizationMode (EMode mode, UErrorCode &status) |
Converts C++'s Normalizer::EMode to C's UNormalizationMode. | |
static EMode | getNormalizerEMode (UNormalizationMode mode, UErrorCode &status) |
Converts C's UNormalizationMode to C++'s Normalizer::EMode. |
The Normalizer class consists of two parts:
The static functions are basically wrappers around the C implementation, using UnicodeString instead of UChar*. For basic information about normalization forms and details about the C API please see the documentation in unorm.h.
The iterator API with the Normalizer constructors and the non-static functions uses a CharacterIterator as input. It is possible to pass a string which is then internally wrapped in a CharacterIterator. The input text is not normalized all at once, but incrementally where needed (providing efficient random access). This makes it possible to pass in a large text but spend only a small amount of time normalizing a small part of that text. However, if the entire text is normalized, then the iterator will be slower than normalizing the entire text at once and iterating over the result. Another possible use of the Normalizer iterator is to report an index into the original text that is close to where the normalized characters come from.
Important: The iterator API was cleaned up significantly for ICU 2.0. The earlier implementation reported the getIndex() inconsistently, and previous() could not be used after setIndex(), next(), first(), and current().
Normalizer allows you to start normalizing from anywhere in the input text by calling setIndexOnly(), setIndex(), first(), or last(). Without calling any of these, the iterator will start at the beginning of the text.
At any time, next() returns the next normalized code point (UChar32), with post-increment semantics (like CharacterIterator::next32PostInc()). previous() returns the previous normalized code point (UChar32), with pre-decrement semantics (like CharacterIterator::previous32()).
current() and setIndex() return the current code point (respectively the one at the newly set index) without moving the getIndex(). Note that if the text at the current position needs to be normalized, then these functions will do that. (This is why current() is not const.) If you call setIndex() and then previous() then you normalize a piece of text (and get a code point from setIndex()) that you probably do not need. It is more efficient to call setIndexOnly() instead, which does not normalize.
getIndex() always refers to the position in the input text where the normalized code points are returned from. It does not always change with each returned code point. The code point that is returned from any of the functions corresponds to text at or after getIndex(), according to the function's iteration semantics (post-increment or pre-decrement).
next() returns a code point from at or after the getIndex() from before the next() call. After the next() call, the getIndex() might have moved to where the next code point will be returned from (from a next() or current() call). This is semantically equivalent to array access with array[index++] (post-increment semantics).
previous() returns a code point from at or after the getIndex() from after the previous() call. This is semantically equivalent to array access with array[--index] (pre-decrement semantics).
Internally, the Normalizer iterator normalizes a small piece of text starting at the getIndex() and ending at a following "safe" index. The normalized result is stored in an internal string buffer, and the code points are iterated from there. With multiple iteration calls, this is repeated until the next piece of text needs to be normalized, and the getIndex() needs to be moved.
The following "safe" index, the internal buffer, and the secondary iteration index into that buffer are not exposed on the API. This also means that it is currently not practical to return to a particular, arbitrary position in the text because one would need to know, and be able to set, in addition to the getIndex(), at least also the current index into the internal buffer. It is currently only possible to observe when getIndex() changes (with careful consideration of the iteration semantics), at which time the internal index will be 0. For example, if getIndex() is different after next() than before it, then the internal index is 0 and one can return to this getIndex() later with setIndexOnly().
|
If DONE is returned from an iteration function that returns a code point, then there are no more normalization results available.
|
|
This tells us what the bits in the "mode" mean.
|
|
The options for a Normalizer object.
|
|
The mode of a Normalizer object.
|
|
Creates a new Normalizer object for iterating over the normalized form of a given string.
|
|
Creates a new Normalizer object for iterating over the normalized form of a given string.
|
|
Creates a new Normalizer object for iterating over the normalized form of the given text.
|
|
Copy constructor.
|
|
Creates a new Normalizer object for iterating over the normalized form of a given string.
|
|
Creates a new Normalizer object for iterating over the normalized form of a given string.
The
|
|
Creates a new Normalizer object for iterating over the normalized form of a given UChar string.
|
|
Creates a new Normalizer object for iterating over the normalized form of a given UChar string.
|
|
Creates a new Normalizer object for iterating over the normalized form of the given text.
|
|
Creates a new Normalizer object for iterating over the normalized form of the given text.
|
|
Returns a pointer to a new Normalizer that is a clone of this one. The caller is responsible for deleting the new clone. |
|
Compose a UnicodeString. This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC. This is a wrapper for unorm_normalize(), using UnicodeString's.
The
|
|
Return the current character in the normalized text. current() may need to normalize some text at getIndex(). The getIndex() is not changed.
|
|
Static method to decompose a UnicodeString. This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD. This is a wrapper for unorm_normalize(), using UnicodeString's.
The
|
|
Retrieve the index of the end of the input text.
This is the end index of the CharacterIterator or the length of the string over which this Normalizer is iterating.
|
|
Return the first character in the normalized text. This is equivalent to setIndexOnly(startIndex()) followed by next(). (Post-increment semantics.)
|
|
Retrieve the current iteration position in the input text that is being normalized. A following call to next() will return a normalized code point from the input text at or after this index. After a call to previous(), getIndex() will point at or before the position in the input text where the normalized code point was returned from with previous().
|
|
Return the basic operation performed by this Normalizer.
|
|
Converts C's UNormalizationMode to C++'s Normalizer::EMode.
|
|
Determine whether an option is turned on or off. If multiple options are specified, then the result is TRUE if any of them are set.
|
|
Copies the input text into the UnicodeString argument.
|
|
Return the normalization mode for this object. This is an unusual name because there used to be a getMode() that returned a different type.
|
|
Converts C++'s Normalizer::EMode to C's UNormalizationMode.
|
|
Generates a hash code for this iterator.
|
|
Return the last character in the normalized text. This is equivalent to setIndexOnly(endIndex()) followed by previous(). (Pre-decrement semantics.)
|
|
Return the next character in the normalized text. (Post-increment semantics.) If the end of the text has already been reached, DONE is returned.
|
|
Normalizes a UnicodeString using the given normalization operation.
The
|
|
Normalizes a UnicodeString according to the specified normalization mode. This is a wrapper for unorm_normalize(), using UnicodeString's.
The
|
|
Returns FALSE when both iterators refer to the same character in the same input text.
|
|
Returns TRUE when both iterators refer to the same character in the same input text.
|
|
Return the previous character in the normalized text and decrement the iteration position. (Pre-decrement semantics.) If the beginning of the text has already been reached, DONE is returned.
|
|
Performing quick check on a string, to quickly determine if the string is in a particular normalization format. Three types of result can be returned: UNORM_YES, UNORM_NO or UNORM_MAYBE. Result UNORM_YES indicates that the argument string is in the desired normalized format; UNORM_NO indicates that the argument string is not in the desired normalized format. A UNORM_MAYBE result indicates that a more thorough check is required; the user may have to put the string in its normalized form and compare the results.
|
|
Performing quick check on a string, to quickly determine if the string is in a particular normalization format. This is a wrapper for unorm_quickCheck(), using a UnicodeString. Three types of result can be returned: UNORM_YES, UNORM_NO or UNORM_MAYBE. Result UNORM_YES indicates that the argument string is in the desired normalized format; UNORM_NO indicates that the argument string is not in the desired normalized format. A UNORM_MAYBE result indicates that a more thorough check is required; the user may have to put the string in its normalized form and compare the results.
|
|
Reset the index to the beginning of the text. This is equivalent to setIndexOnly(startIndex()). |
|
Set the iteration position in the input text that is being normalized and return the first normalized character at that position. This is equivalent to setIndexOnly() followed by current(). After setIndex(), getIndex() will return the same index that is specified here. Note that setIndex() normalizes some text starting at the specified index and returns the first code point from that normalization. If the next call is to previous() then this piece of text probably did not need to be normalized. This function is deprecated. It is recommended to use setIndexOnly() instead of setIndex().
|
|
Set the iteration position in the input text that is being normalized, without any immediate normalization. After setIndexOnly(), getIndex() will return the same index that is specified here.
|
|
Set the normalization mode for this object.
Note: If the normalization mode is changed while iterating over a string, calls to next and previous may return previously buffered characters in the old normalization mode until the iteration is able to re-sync at the next base character. It is safest to call setText(), first, last, etc. after calling setMode.
|
|
Set the normalization mode for this object.
Note: If the normalization mode is changed while iterating over a string, calls to next and previous may return previously buffered characters in the old normalization mode until the iteration is able to re-sync at the next base character. It is safest to call setIndexOnly, reset, setText(), first, last, etc. after calling setMode.
|
|
Set options that affect this Normalizer's operation. Options do not change the basic composition or decomposition operation that is being performed, but they control whether certain optional portions of the operation are done. Currently the only available option is deprecated. It is possible to specify multiple options that are all turned on or off.
|
|
Set the input text over which this Normalizer will iterate. The iteration position is set to the beginning.
|
|
Set the input text over which this Normalizer will iterate. The iteration position is set to the beginning.
|
|
Set the input text over which this Normalizer will iterate. The iteration position is set to the beginning.
|
|
Retrieve the index of the start of the input text.
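This is the begin index of the CharacterIterator or the start (i.e. index 0) of the string over which this Normalizer is iterating.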
|