#include "unicode/utf.h"
Go to the source code of this file.
Defines | |
#define | U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) |
Count the trail bytes for a UTF-8 lead byte. | |
#define | U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. | |
#define | U8_IS_SINGLE(c) (((c)&0x80)==0) |
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? | |
#define | U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) |
Is this code unit (byte) a UTF-8 lead byte? | |
#define | U8_IS_TRAIL(c) (((c)&0xc0)==0x80) |
Is this code unit (byte) a UTF-8 trail byte? | |
#define | U8_LENGTH(c) |
How many code units (bytes) are used for the UTF-8 encoding of this Unicode code point? | |
#define | U8_MAX_LENGTH 4 |
The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). | |
#define | U8_GET_UNSAFE(s, i, c) |
Get a code point from a string at a random-access offset, without changing the offset. | |
#define | U8_GET(s, start, i, length, c) |
Get a code point from a string at a random-access offset, without changing the offset. | |
#define | U8_NEXT_UNSAFE(s, i, c) |
Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary. | |
#define | U8_NEXT(s, i, length, c) |
Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary. | |
#define | U8_APPEND_UNSAFE(s, i, c) |
Append a code point to a string, overwriting 1 to 4 bytes. | |
#define | U8_APPEND(s, i, capacity, c, isError) |
Append a code point to a string, overwriting 1 to 4 bytes. | |
#define | U8_FWD_1_UNSAFE(s, i) |
Advance the string offset from one code point boundary to the next. | |
#define | U8_FWD_1(s, i, length) |
Advance the string offset from one code point boundary to the next. | |
#define | U8_FWD_N_UNSAFE(s, i, n) |
Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. | |
#define | U8_FWD_N(s, i, length, n) |
Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. | |
#define | U8_SET_CP_START_UNSAFE(s, i) |
Adjust a random-access offset to a code point boundary at the start of a code point. | |
#define | U8_SET_CP_START(s, start, i) |
Adjust a random-access offset to a code point boundary at the start of a code point. | |
#define | U8_PREV_UNSAFE(s, i, c) |
Move the string offset from one code point boundary to the previous one and get the code point between them. | |
#define | U8_PREV(s, start, i, c) |
Move the string offset from one code point boundary to the previous one and get the code point between them. | |
#define | U8_BACK_1_UNSAFE(s, i) |
Move the string offset from one code point boundary to the previous one. | |
#define | U8_BACK_1(s, start, i) |
Move the string offset from one code point boundary to the previous one. | |
#define | U8_BACK_N_UNSAFE(s, i, n) |
Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. | |
#define | U8_BACK_N(s, start, i, n) |
Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. | |
#define | U8_SET_CP_LIMIT_UNSAFE(s, i) |
Adjust a random-access offset to a code point boundary after a code point. | |
#define | U8_SET_CP_LIMIT(s, start, i, length) |
Adjust a random-access offset to a code point boundary after a code point. | |
Functions | |
UChar32 | utf8_nextCharSafeBody (const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) |
Function for handling "next code point" with error-checking. | |
int32_t | utf8_appendCharSafeBody (uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) |
Function for handling "append code point" with error-checking. | |
UChar32 | utf8_prevCharSafeBody (const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) |
Function for handling "previous code point" with error-checking. | |
int32_t | utf8_back1SafeBody (const uint8_t *s, int32_t start, int32_t i) |
Function for handling "skip backward one code point" with error-checking. | |
Variables | |
U_CFUNC U_IMPORT const uint8_t | utf8_countTrailBytes [256] |
Internal array with numbers of trail bytes for any given byte used in lead byte position. |
This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. utf8.h is included by utf.h after unicode/umachine.h and some common definitions.
For more information see utf.h and the ICU User Guide Strings chapter (http://icu.sourceforge.net/userguide/strings.html).
Usage: ICU coding guidelines for if() statements should be followed when using these macros. Compound statements (curly braces {}) must be used for if-else-while... bodies, and all macro statements should be terminated with a semicolon.
Definition in file utf8.h.
|
Value: { \ if((uint32_t)(c)<=0x7f) { \ (s)[(i)++]=(uint8_t)(c); \ } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \ (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \ (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ } else { \ (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(capacity), c, &(isError)); \ } \ } The offset points to the current end of the string contents and is advanced (post-increment). "Safe" macro, checks for a valid code point. If a non-ASCII code point is written, checks for sufficient space in the string. If the code point is not valid or trail bytes do not fit, then isError is set to TRUE.
|
|
Value: { \ if((uint32_t)(c)<=0x7f) { \ (s)[(i)++]=(uint8_t)(c); \ } else { \ if((uint32_t)(c)<=0x7ff) { \ (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ } else { \ if((uint32_t)(c)<=0xffff) { \ (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ } else { \ (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ } \ } The offset points to the current end of the string contents and is advanced (post-increment). "Unsafe" macro, assumes a valid code point and sufficient space in the string. Otherwise, the result is undefined.
|
|
Value: { \ if(U8_IS_TRAIL((s)[--(i)])) { \ (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ } \ } (Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ while(U8_IS_TRAIL((s)[--(i)])) {} \ } (Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.
|
|
Value: { \ int32_t __N=(n); \ while(__N>0 && (i)>(start)) { \ U8_BACK_1(s, start, i); \ --__N; \ } \ } Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. (Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ int32_t __N=(n); \ while(__N>0) { \ U8_BACK_1_UNSAFE(s, i); \ --__N; \ } \ } Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. (Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.
|
|
Count the trail bytes for a UTF-8 lead byte.
|
|
Value: { \ uint8_t __b=(uint8_t)(s)[(i)++]; \ if(U8_IS_LEAD(__b)) { \ uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ if((i)+__count>(length)) { \ __count=(uint8_t)((length)-(i)); \ } \ while(__count>0 && U8_IS_TRAIL((s)[i])) { \ ++(i); \ --__count; \ } \ } \ } (Post-incrementing iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \ } (Post-incrementing iteration.) "Unsafe" macro, assumes well-formed UTF-8.
|
|
Value: { \ int32_t __N=(n); \ while(__N>0 && (i)<(length)) { \ U8_FWD_1(s, i, length); \ --__N; \ } \ } Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. (Post-incrementing iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ int32_t __N=(n); \ while(__N>0) { \ U8_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. (Post-incrementing iteration.) "Unsafe" macro, assumes well-formed UTF-8.
|
|
Value: { \ int32_t _u8_get_index=(int32_t)(i); \ U8_SET_CP_START(s, start, _u8_get_index); \ U8_NEXT(s, _u8_get_index, length, c); \ } The offset may point to either the lead byte or one of the trail bytes for a code point, in which case the macro will read all of the bytes for the code point. If the offset points to an illegal UTF-8 byte sequence, then c is set to a negative value. Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
|
Value: { \ int32_t _u8_get_unsafe_index=(int32_t)(i); \ U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ } The offset may point to either the lead byte or one of the trail bytes for a code point, in which case the macro will read all of the bytes for the code point. The result is undefined if the offset points to an illegal UTF-8 byte sequence. Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
|
Is this code unit (byte) a UTF-8 lead byte?
|
|
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
|
|
Is this code unit (byte) a UTF-8 trail byte?
|
|
Value: ((uint32_t)(c)<=0x7f ? 1 : \ ((uint32_t)(c)<=0x7ff ? 2 : \ ((uint32_t)(c)<=0xd7ff ? 3 : \ ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ ((uint32_t)(c)<=0xffff ? 3 : 4)\ ) \ ) \ ) \ )
|
|
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
|
|
The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
|
|
Value: { \ (c)=(uint8_t)(s)[(i)++]; \ if((c)>=0x80) { \ uint8_t __t1, __t2; \ if( /* handle U+1000..U+CFFF inline */ \ (0xe0<(c) && (c)<=0xec) && \ (((i)+1)<(length)) && \ (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ ) { \ /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ (i)+=2; \ } else if( /* handle U+0080..U+07FF inline */ \ ((c)<0xe0 && (c)>=0xc2) && \ ((i)<(length)) && \ (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ ) { \ (c)=(UChar)((((c)&0x1f)<<6)|__t1); \ ++(i); \ } else if(U8_IS_LEAD(c)) { \ /* function call for "complicated" and error cases */ \ (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -1); \ } else { \ (c)=U_SENTINEL; \ } \ } \ } (Post-incrementing forward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries. The offset may point to the lead byte of a multi-byte sequence, in which case the macro will read the whole sequence. If the offset points to a trail byte or an illegal UTF-8 sequence, then c is set to a negative value.
|
|
Value: { \ (c)=(uint8_t)(s)[(i)++]; \ if((uint8_t)((c)-0xc0)<0x35) { \ uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ U8_MASK_LEAD_BYTE(c, __count); \ switch(__count) { \ /* each following branch falls through to the next one */ \ case 3: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ case 2: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ case 1: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ /* no other branches to optimize switch() */ \ break; \ } \ } \ } (Post-incrementing forward iteration.) "Unsafe" macro, assumes well-formed UTF-8. The offset may point to the lead byte of a multi-byte sequence, in which case the macro will read the whole sequence. The result is undefined if the offset points to a trail byte or an illegal UTF-8 sequence.
|
|
Value: { \ (c)=(uint8_t)(s)[--(i)]; \ if((c)>=0x80) { \ if((c)<=0xbf) { \ (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ } else { \ (c)=U_SENTINEL; \ } \ } \ } (Pre-decrementing backward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries. The input offset may be the same as the string length. If the offset is behind a multi-byte sequence, then the macro will read the whole sequence. If the offset is behind a lead byte (a byte above 0xbf), then c is set to U_SENTINEL rather than to the byte value. If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
|
|
Value: { \ (c)=(uint8_t)(s)[--(i)]; \ if(U8_IS_TRAIL(c)) { \ uint8_t __b, __count=1, __shift=6; \ \ /* c is a trail byte */ \ (c)&=0x3f; \ for(;;) { \ __b=(uint8_t)(s)[--(i)]; \ if(__b>=0xc0) { \ U8_MASK_LEAD_BYTE(__b, __count); \ (c)|=(UChar32)__b<<__shift; \ break; \ } else { \ (c)|=(UChar32)(__b&0x3f)<<__shift; \ ++__count; \ __shift+=6; \ } \ } \ } \ } (Pre-decrementing backward iteration.) "Unsafe" macro, assumes well-formed UTF-8. The input offset may be the same as the string length. If the offset is behind a multi-byte sequence, then the macro will read the whole sequence. If the offset is behind a lead byte, then that itself will be returned as the code point. The result is undefined if the offset is behind an illegal UTF-8 sequence.
|
|
Adjust a random-access offset to a code point boundary after a code point. If the offset is behind a partial multi-byte sequence, then the offset is incremented to behind the whole sequence. Otherwise, it is not modified. The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ U8_BACK_1_UNSAFE(s, i); \ U8_FWD_1_UNSAFE(s, i); \ } If the offset is behind a partial multi-byte sequence, then the offset is incremented to behind the whole sequence. Otherwise, it is not modified. The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.
|
|
Value: { \ if(U8_IS_TRAIL((s)[(i)])) { \ (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ } \ } If the offset points to a UTF-8 trail byte, then the offset is moved backward to the corresponding lead byte. Otherwise, it is not modified. "Safe" macro, checks for illegal sequences and for string boundaries.
|
|
Value: { \ while(U8_IS_TRAIL((s)[i])) { --(i); } \ } If the offset points to a UTF-8 trail byte, then the offset is moved backward to the corresponding lead byte. Otherwise, it is not modified. "Unsafe" macro, assumes well-formed UTF-8.
|
|
Function for handling "append code point" with error-checking.
|
|
Function for handling "skip backward one code point" with error-checking.
|
|
Function for handling "next code point" with error-checking.
|
|
Function for handling "previous code point" with error-checking.
|
|
Internal array with numbers of trail bytes for any given byte used in lead byte position.
|