#include <uniset.h>
Inheritance diagram for UnicodeSet:
Public Member Functions | |
UnicodeSet () | |
Constructs an empty set. | |
UnicodeSet (UChar32 start, UChar32 end) | |
Constructs a set containing the given range. | |
UnicodeSet (const UnicodeString &pattern, UErrorCode &status) | |
Constructs a set from the given pattern. | |
UnicodeSet (const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status) | |
Constructs a set from the given pattern. | |
UnicodeSet (const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status) | |
Constructs a set from the given pattern. | |
UnicodeSet (const UnicodeSet &o) | |
Constructs a set that is identical to the given UnicodeSet. | |
virtual | ~UnicodeSet () |
Destructs the set. | |
UnicodeSet & | operator= (const UnicodeSet &o) |
Assigns this object to be a copy of another. | |
virtual UBool | operator== (const UnicodeSet &o) const |
Compares the specified object with this set for equality. | |
UBool | operator!= (const UnicodeSet &o) const |
Compares the specified object with this set for inequality. | |
virtual UnicodeFunctor * | clone () const |
Returns a copy of this object. | |
virtual int32_t | hashCode (void) const |
Returns the hash code value for this set. | |
UnicodeSet & | set (UChar32 start, UChar32 end) |
Make this object represent the range start - end . | |
virtual UnicodeSet & | applyPattern (const UnicodeString &pattern, UErrorCode &status) |
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. | |
UnicodeSet & | applyPattern (const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status) |
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. | |
UnicodeSet & | applyPattern (const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status) |
Parses the given pattern, starting at the given position. | |
virtual UnicodeString & | toPattern (UnicodeString &result, UBool escapeUnprintable=FALSE) const |
Returns a string representation of this set. | |
UnicodeSet & | applyIntPropertyValue (UProperty prop, int32_t value, UErrorCode &ec) |
Modifies this set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. | |
UnicodeSet & | applyPropertyAlias (const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec) |
Modifies this set to contain those code points which have the given value for the given property. | |
virtual int32_t | size (void) const |
Returns the number of elements in this set (its cardinality), n, where 0 <= n <= 65536 . | |
virtual UBool | isEmpty (void) const |
Returns true if this set contains no elements. | |
virtual UBool | contains (UChar32 c) const |
Returns true if this set contains the given character. | |
virtual UBool | contains (UChar32 start, UChar32 end) const |
Returns true if this set contains every character of the given range. | |
UBool | contains (const UnicodeString &s) const |
Returns true if this set contains the given multicharacter string. | |
virtual UBool | containsAll (const UnicodeSet &c) const |
Returns true if this set contains all the characters and strings of the given set. | |
UBool | containsAll (const UnicodeString &s) const |
Returns true if this set contains all the characters of the given string. | |
UBool | containsNone (UChar32 start, UChar32 end) const |
Returns true if this set contains none of the characters of the given range. | |
UBool | containsNone (const UnicodeSet &c) const |
Returns true if this set contains none of the characters and strings of the given set. | |
UBool | containsNone (const UnicodeString &s) const |
Returns true if this set contains none of the characters of the given string. | |
UBool | containsSome (UChar32 start, UChar32 end) const |
Returns true if this set contains one or more of the characters in the given range. | |
UBool | containsSome (const UnicodeSet &s) const |
Returns true if this set contains one or more of the characters and strings of the given set. | |
UBool | containsSome (const UnicodeString &s) const |
Returns true if this set contains one or more of the characters of the given string. | |
UMatchDegree | matches (const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental) |
Implement UnicodeMatcher::matches() ICU 2.4. | |
void | addMatchSetTo (UnicodeSet &toUnionTo) const |
Implementation of UnicodeMatcher API. | |
int32_t | indexOf (UChar32 c) const |
Returns the index of the given character within this set, where the set is ordered by ascending code point. | |
UChar32 | charAt (int32_t index) const |
Returns the character at the given index within this set, where the set is ordered by ascending code point. | |
virtual UnicodeSet & | add (UChar32 start, UChar32 end) |
Adds the specified range to this set if it is not already present. | |
UnicodeSet & | add (UChar32 c) |
Adds the specified character to this set if it is not already present. | |
UnicodeSet & | add (const UnicodeString &s) |
Adds the specified multicharacter to this set if it is not already present. | |
UnicodeSet & | addAll (const UnicodeString &s) |
Adds each of the characters in this string to the set. | |
UnicodeSet & | retainAll (const UnicodeString &s) |
Retains EACH of the characters in this string. | |
UnicodeSet & | complementAll (const UnicodeString &s) |
Complement EACH of the characters in this string. | |
UnicodeSet & | removeAll (const UnicodeString &s) |
Remove EACH of the characters in this string. | |
virtual UnicodeSet & | retain (UChar32 start, UChar32 end) |
Retain only the elements in this set that are contained in the specified range. | |
UnicodeSet & | retain (UChar32 c) |
Retain the specified character from this set if it is present. | |
virtual UnicodeSet & | remove (UChar32 start, UChar32 end) |
Removes the specified range from this set if it is present. | |
UnicodeSet & | remove (UChar32 c) |
Removes the specified character from this set if it is present. | |
UnicodeSet & | remove (const UnicodeString &s) |
Removes the specified string from this set if it is present. | |
virtual UnicodeSet & | complement (void) |
Inverts this set. | |
virtual UnicodeSet & | complement (UChar32 start, UChar32 end) |
Complements the specified range in this set. | |
UnicodeSet & | complement (UChar32 c) |
Complements the specified character in this set. | |
UnicodeSet & | complement (const UnicodeString &s) |
Complement the specified string in this set. | |
virtual UnicodeSet & | addAll (const UnicodeSet &c) |
Adds all of the elements in the specified set to this set if they're not already present. | |
virtual UnicodeSet & | retainAll (const UnicodeSet &c) |
Retains only the elements in this set that are contained in the specified set. | |
virtual UnicodeSet & | removeAll (const UnicodeSet &c) |
Removes from this set all of its elements that are contained in the specified set. | |
virtual UnicodeSet & | complementAll (const UnicodeSet &c) |
Complements in this set all elements contained in the specified set. | |
virtual UnicodeSet & | clear (void) |
Removes all of the elements from this set. | |
UnicodeSet & | closeOver (int32_t attribute) |
Close this set over the given attribute. | |
virtual int32_t | getRangeCount (void) const |
Iteration method that returns the number of ranges contained in this set. | |
virtual UChar32 | getRangeStart (int32_t index) const |
Iteration method that returns the first character in the specified range of this set. | |
virtual UChar32 | getRangeEnd (int32_t index) const |
Iteration method that returns the last character in the specified range of this set. | |
int32_t | serialize (uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const |
Serializes this set into an array of 16-bit integers. | |
virtual UnicodeSet & | compact () |
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. | |
virtual UClassID | getDynamicClassID (void) const |
Implement UnicodeFunctor API. | |
Static Public Member Functions | |
UBool | resemblesPattern (const UnicodeString &pattern, int32_t pos) |
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern. | |
UnicodeSet * | createFrom (const UnicodeString &s) |
Makes a set from a multicharacter string. | |
UnicodeSet * | createFromAll (const UnicodeString &s) |
Makes a set from each of the characters in the string. | |
UClassID | getStaticClassID (void) |
Return the class ID for this class. | |
Static Public Attributes | |
const UChar32 | MIN_VALUE |
Minimum value that can be stored in a UnicodeSet. | |
const UChar32 | MAX_VALUE |
Maximum value that can be stored in a UnicodeSet. |
Objects of this class represent character classes used in regular expressions. A character class specifies a subset of Unicode code points. Legal code points are U+0000 to U+10FFFF, inclusive.
The UnicodeSet class is not designed to be subclassed.
UnicodeSet
supports two APIs. The first is the operand API that allows the caller to modify the value of a UnicodeSet
object. It conforms to Java 2's java.util.Set
interface, although UnicodeSet
does not actually implement that interface. All methods of Set
are supported, with the modification that they take a character range or single character instead of an Object
, and they take a UnicodeSet
instead of a Collection
. The operand API may be thought of in terms of boolean logic: a boolean OR is implemented by add
, a boolean AND is implemented by retain
, a boolean XOR is implemented by complement
taking an argument, and a boolean NOT is implemented by complement
with no argument. In terms of traditional set theory function names, add
is a union, retain
is an intersection, remove
is an asymmetric difference, and complement
with no argument is a set complement with respect to the superset range MIN_VALUE-MAX_VALUE
The second API is the applyPattern()
/toPattern()
API from the java.text.Format
-derived classes. Unlike the methods that add characters, add categories, and control the logic of the set, the method applyPattern()
sets all attributes of a UnicodeSet
at once, based on a string pattern.
Pattern syntax
Patterns are accepted by the constructors and the applyPattern()
methods and returned by the toPattern()
method. These patterns follow a syntax similar to that employed by version 8 regular expression character classes. Here are some simple examples:
<blockquote>
[] | No characters |
[a] | The character 'a' |
[ae] | The characters 'a' and 'e' |
[a-e] | The characters 'a' through 'e' inclusive, in Unicode code point order |
[\u4E01] | The character U+4E01 |
[a{ab}{ac}] | The character 'a' and the multicharacter strings "ab" and "ac" |
[\p{Lu}] | All characters in the general category Uppercase Letter |
Any character may be preceded by a backslash in order to remove any special meaning. White space characters, as defined by UCharacter.isWhitespace(), are ignored, unless they are escaped.
Property patterns specify a set of characters having a certain property as defined by the Unicode standard. Both the POSIX-like "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a complete list of supported property patterns, see the User's Guide for UnicodeSet at http://oss.software.ibm.com/icu/userguide/unicodeSet.html. Actual determination of property data is defined by the underlying Unicode database as implemented by UCharacter.
Patterns specify individual characters, ranges of characters, and Unicode property sets. When elements are concatenated, they specify their union. To complement a set, place a '^' immediately after the opening '['. Property patterns are inverted by modifying their delimiters; "[:^foo:]" and "\P{foo}". In any other location, '^' has no special meaning.
Ranges are indicated by placing a '-' between two characters, as in "a-z". This specifies the range of all characters from the left to the right, in Unicode order. If the left character is greater than or equal to the right character it is a syntax error. If a '-' occurs as the first character after the opening '[' or '[^', or if it occurs as the last character before the closing ']', then it is taken as a literal. Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same set of three characters, 'a', 'b', and '-'.
Sets may be intersected using the '&' operator or the asymmetric set difference may be taken using the '-' operator, for example, "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters with values less than 4096. Operators ('&' and '-') have equal precedence and bind left-to-right. Thus "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]". This only really matters for difference; intersection is commutative.
[a] | The set containing 'a' |
[a-z] | The set containing 'a' through 'z' and all letters in between, in Unicode order |
[^a-z] | The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF |
[[pat1][pat2]] | The union of sets specified by pat1 and pat2 |
[[pat1]&[pat2]] | The intersection of sets specified by pat1 and pat2 |
[[pat1]-[pat2]] | The asymmetric difference of sets specified by pat1 and pat2 |
[:Lu:] or \p{Lu} | The set of characters having the specified Unicode property; in this case, Unicode uppercase letters |
[:^Lu:] or \P{Lu} | The set of characters not having the given Unicode property |
Warning: you cannot add an empty string ("") to a UnicodeSet.
Formal syntax
<blockquote>
pattern := | ('[' '^'? item* ']') | property |
item := | char | (char '-' char) | pattern-expr |
pattern-expr := | pattern | pattern-expr pattern | pattern-expr op pattern |
op := | '&' | '-' |
special := | '[' | ']' | '-' |
char := | any character that is not special | ('\' any character) | ('\u' hex hex hex hex) |
hex := | any character for which Character.digit(c, 16) returns a non-negative result |
property := | a Unicode property set pattern |
Legend:
|
|
Constructs an empty set. ICU 2.0 |
|
Constructs a set containing the given range.
If start > end then an empty set is created.
|
|
Constructs a set from the given pattern. See the class description for the syntax of the pattern language.
|
|
Constructs a set from the given pattern. See the class description for the syntax of the pattern language.
For internal use only.
|
|
Constructs a set from the given pattern. See the class description for the syntax of the pattern language.
|
|
Constructs a set that is identical to the given UnicodeSet. ICU 2.0 |
|
Destructs the set. ICU 2.0 |
|
Adds the specified multicharacter to this set if it is not already present.
If this set already contains the multicharacter, the call leaves this set unchanged. Thus "ch" => {"ch"}
|
|
Adds the specified character to this set if it is not already present. If this set already contains the specified character, the call leaves this set unchanged. ICU 2.0 |
|
Adds the specified range to this set if it is not already present.
If this set already contains the specified range, the call leaves this set unchanged. If start > end then an empty range is added, leaving the set unchanged.
|
|
Adds all of the elements in the specified set to this set if they're not already present. This operation effectively modifies this set so that its value is the union of the two sets. The behavior of this operation is unspecified if the specified collection is modified while the operation is in progress.
|
|
Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} If this set already contains any particular character, it has no effect on that character.
|
|
Implementation of UnicodeMatcher API. Union the set of all characters that may be matched by this object into the given set.
Implements UnicodeMatcher.
|
|
Modifies this set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. Prior contents of this set are lost.
|
|
Parses the given pattern, starting at the given position. The character at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. Parsing continues until the corresponding closing ']'. If a syntax error is encountered between the opening and closing brace, the parse fails. Upon return from a successful parse, the ParsePosition is updated to point to the character following the closing ']', and a StringBuffer containing a pairs list for the parsed pattern is returned. This method calls itself recursively to parse embedded subpatterns. Empties the set passed before applying the pattern.
|
|
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. See the class description for the syntax of the pattern language.
For internal use only.
|
|
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. See the class description for the syntax of the pattern language.
|
|
Modifies this set to contain those code points which have the given value for the given property. Prior contents of this set are lost.
|
|
Returns the character at the given index within this set, where the set is ordered by ascending code point.
If the index is out of range, return (UChar32)-1. The inverse of this method is indexOf().
|
|
Removes all of the elements from this set. This set will be empty after this call returns. ICU 2.0 |
|
Returns a copy of this object. All UnicodeFunctor objects have to support cloning in order to allow classes using UnicodeFunctors, such as Transliterator, to implement cloning. ICU 2.0 Implements UnicodeFunctor.
|
|
Close this set over the given attribute. For the attribute USET_CASE, the result is to modify this set so that: 1. For each character or string 'a' in this set, all strings or characters 'b' such that foldCase(a) == foldCase(b) are added to this set. 2. For each string 'e' in the resulting set, if e != foldCase(e), 'e' will be removed. Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] (Here foldCase(x) refers to the operation u_strFoldCase, and a == b denotes that the contents are the same, not pointer comparison.)
For internal use only.
|
|
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. ICU 2.4 |
|
Complement the specified string in this set.
The set will not contain the specified string once the call returns.
|
|
Complements the specified character in this set. The character will be removed if it is in this set, or will be added if it is not in this set. ICU 2.0 |
|
Complements the specified range in this set.
Any character in the range will be removed if it is in this set, or will be added if it is not in this set. If start > end then an empty range is complemented, leaving the set unchanged.
|
|
Inverts this set.
This operation modifies this set so that its value is its complement. This is equivalent to complement(MIN_VALUE, MAX_VALUE). |
|
Complements in this set all elements contained in the specified set. Any character in the other set will be removed if it is in this set, or will be added if it is not in this set.
|
|
Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} If this set already contains any particular character, it has no effect on that character.
|
|
Returns true if this set contains the given multicharacter string.
|
|
Returns true if this set contains every character of the given range.
|
|
Returns true if this set contains the given character.
Implements UnicodeFilter.
|
|
Returns true if this set contains all the characters of the given string.
|
|
Returns true if this set contains all the characters and strings of the given set.
|
|
Returns true if this set contains none of the characters of the given string.
|
|
Returns true if this set contains none of the characters and strings of the given set.
|
|
Returns true if this set contains none of the characters of the given range.
|
|
Returns true if this set contains one or more of the characters of the given string.
|
|
Returns true if this set contains one or more of the characters and strings of the given set.
|
|
Returns true if this set contains one or more of the characters in the given range.
|
|
Makes a set from a multicharacter string.
Thus "ch" => {"ch"}
|
|
Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
|
|
Implement UnicodeFunctor API.
Implements UnicodeFilter.
|
|
Iteration method that returns the number of ranges contained in this set.
|
|
Iteration method that returns the last character in the specified range of this set.
|
|
Iteration method that returns the first character in the specified range of this set.
|
|
Return the class ID for this class. This is useful only for comparing to a return value from getDynamicClassID(). For example: . Base* polymorphic_pointer = createPolymorphicObject(); . if (polymorphic_pointer->getDynamicClassID() == . Derived::getStaticClassID()) ...
|
|
Returns the hash code value for this set.
|
|
Returns the index of the given character within this set, where the set is ordered by ascending code point.
If the character is not in this set, return -1. The inverse of this method is charAt().
|
|
Returns true if this set contains no elements.
|
|
Compares the specified object with this set for inequality.
Returns true if the specified set is not equal to this set. |
|
Assigns this object to be a copy of another. ICU 2.0 |
|
Compares the specified object with this set for equality.
Returns true if the specified set is equal to this set.
|
|
Removes the specified string from this set if it is present. The set will not contain the specified string once the call returns.
|
|
Removes the specified character from this set if it is present. The set will not contain the specified character once the call returns. ICU 2.0 |
|
Removes the specified range from this set if it is present.
The set will not contain the specified range once the call returns. If start > end then an empty range is removed, leaving the set unchanged.
|
|
Removes from this set all of its elements that are contained in the specified set. This operation effectively modifies this set so that its value is the asymmetric set difference of the two sets.
|
|
Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} If this set already contains any particular character, it has no effect on that character.
|
|
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern. ICU 2.4 |
|
Retain the specified character from this set if it is present. ICU 2.0 |
|
Retain only the elements in this set that are contained in the specified range.
If start > end then an empty range is retained, leaving the set empty.
|
|
Retains only the elements in this set that are contained in the specified set. In other words, removes from this set all of its elements that are not contained in the specified set. This operation effectively modifies this set so that its value is the intersection of the two sets.
|
|
Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} If this set already contains any particular character, it has no effect on that character.
|
|
Serializes this set into an array of 16-bit integers. Serialization (currently) only records the characters in the set; multicharacter strings are ignored. The array has following format (each line is one 16-bit integer): length = (n+2*m) | (m!=0?0x8000:0) bmpLength = n; present if m!=0 bmp[0] bmp[1] ... bmp[n-1] supp-high[0] supp-low[0] supp-high[1] supp-low[1] ... supp-high[m-1] supp-low[m-1] The array starts with a header. After the header are n bmp code points, then m supplementary code points. Either n or m or both may be zero. n+2*m is always <= 0x7FFF. If there are no supplementary characters (if m==0) then the header is one 16-bit integer, 'length', with value n. If there are supplementary characters (if m!=0) then the header is two 16-bit integers. The first, 'length', has value (n+2*m)|0x8000. The second, 'bmpLength', has value n. After the header the code points are stored in ascending order. Supplementary code points are stored as most significant 16 bits followed by least significant 16 bits.
|
|
Make this object represent the range start - end.
If start > end then this object is set to an empty range.
|
|
Returns the number of elements in this set (its cardinality), n, where 0 <= n <= 65536.
|
|
Returns a string representation of this set. If the result of calling this function is passed to a UnicodeSet constructor, it will produce another set that is equal to this one.
Implements UnicodeMatcher.
|
|
Maximum value that can be stored in a UnicodeSet. ICU 2.4 |
|
Minimum value that can be stored in a UnicodeSet. ICU 2.4 |