utf8.h

Go to the documentation of this file.
00001 /* 00002 ******************************************************************************* 00003 * 00004 * Copyright (C) 1999-2004, International Business Machines 00005 * Corporation and others. All Rights Reserved. 00006 * 00007 ******************************************************************************* 00008 * file name: utf8.h 00009 * encoding: US-ASCII 00010 * tab size: 8 (not used) 00011 * indentation:4 00012 * 00013 * created on: 1999sep13 00014 * created by: Markus W. Scherer 00015 */ 00016 00034 /* utf.h must be included first. */ 00035 #ifndef __UTF_H__ 00036 # include "unicode/utf.h" 00037 #endif 00038 00039 00040 #ifndef __UTF8_H__ 00041 #define __UTF8_H__ 00042 00043 /* internal definitions ----------------------------------------------------- */ 00044 00051 #ifdef U_UTF8_IMPL 00052 U_INTERNAL const uint8_t 00053 #elif defined(U_STATIC_IMPLEMENTATION) 00054 U_CFUNC const uint8_t 00055 #else 00056 U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 00057 #endif 00058 utf8_countTrailBytes[256]; 00059 00064 #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) 00065 00070 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 00071 00076 U_INTERNAL UChar32 U_EXPORT2 00077 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); 00078 00083 U_INTERNAL int32_t U_EXPORT2 00084 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 00085 00090 U_INTERNAL UChar32 U_EXPORT2 00091 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); 00092 00097 U_INTERNAL int32_t U_EXPORT2 00098 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 00099 00100 /* single-code point definitions -------------------------------------------- */ 00101 00108 #define U8_IS_SINGLE(c) (((c)&0x80)==0) 00109 00116 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) 00117 00124 #define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) 00125 00133 #define U8_LENGTH(c) \ 00134 ((uint32_t)(c)<=0x7f ? 1 : \ 00135 ((uint32_t)(c)<=0x7ff ? 2 : \ 00136 ((uint32_t)(c)<=0xd7ff ? 3 : \ 00137 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 00138 ((uint32_t)(c)<=0xffff ? 3 : 4)\ 00139 ) \ 00140 ) \ 00141 ) \ 00142 ) 00143 00149 #define U8_MAX_LENGTH 4 00150 00167 #define U8_GET_UNSAFE(s, i, c) { \ 00168 int32_t __I=(int32_t)(i); \ 00169 U8_SET_CP_START_UNSAFE(s, __I); \ 00170 U8_NEXT_UNSAFE(s, __I, c); \ 00171 } 00172 00191 #define U8_GET(s, start, i, length, c) { \ 00192 int32_t __I=(int32_t)(i); \ 00193 U8_SET_CP_START(s, start, __I); \ 00194 U8_NEXT(s, __I, length, c); \ 00195 } 00196 00197 /* definitions with forward iteration --------------------------------------- */ 00198 00216 #define U8_NEXT_UNSAFE(s, i, c) { \ 00217 (c)=(s)[(i)++]; \ 00218 if((uint8_t)((c)-0xc0)<0x35) { \ 00219 uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ 00220 U8_MASK_LEAD_BYTE(c, __count); \ 00221 switch(__count) { \ 00222 /* each following branch falls through to the next one */ \ 00223 case 3: \ 00224 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00225 case 2: \ 00226 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00227 case 1: \ 00228 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00229 /* no other branches to optimize switch() */ \ 00230 break; \ 00231 } \ 00232 } \ 00233 } 00234 00253 #define U8_NEXT(s, i, length, c) { \ 00254 (c)=(s)[(i)++]; \ 00255 if(((uint8_t)(c))>=0x80) { \ 00256 if(U8_IS_LEAD(c)) { \ 00257 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -1); \ 00258 } else { \ 00259 (c)=U_SENTINEL; \ 00260 } \ 00261 } \ 00262 } 00263 00277 #define U8_APPEND_UNSAFE(s, i, c) { \ 00278 if((uint32_t)(c)<=0x7f) { \ 00279 (s)[(i)++]=(uint8_t)(c); \ 00280 } else { \ 00281 if((uint32_t)(c)<=0x7ff) { \ 00282 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 00283 } else { \ 00284 if((uint32_t)(c)<=0xffff) { \ 00285 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 00286 } else { \ 00287 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ 00288 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ 00289 } \ 00290 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 00291 } \ 00292 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 00293 } \ 00294 } 00295 00313 #define U8_APPEND(s, i, length, c, isError) { \ 00314 if((uint32_t)(c)<=0x7f) { \ 00315 (s)[(i)++]=(uint8_t)(c); \ 00316 } else { \ 00317 (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \ 00318 } \ 00319 } 00320 00331 #define U8_FWD_1_UNSAFE(s, i) { \ 00332 (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \ 00333 } 00334 00346 #define U8_FWD_1(s, i, length) { \ 00347 uint8_t __b=(s)[(i)++]; \ 00348 if(U8_IS_LEAD(__b)) { \ 00349 uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ 00350 if((i)+__count>(length)) { \ 00351 __count=(uint8_t)((length)-(i)); \ 00352 } \ 00353 while(__count>0 && U8_IS_TRAIL((s)[i])) { \ 00354 ++(i); \ 00355 --__count; \ 00356 } \ 00357 } \ 00358 } 00359 00372 #define U8_FWD_N_UNSAFE(s, i, n) { \ 00373 int32_t __N=(n); \ 00374 while(__N>0) { \ 00375 U8_FWD_1_UNSAFE(s, i); \ 00376 --__N; \ 00377 } \ 00378 } 00379 00393 #define U8_FWD_N(s, i, length, n) { \ 00394 int32_t __N=(n); \ 00395 while(__N>0 && (i)<(length)) { \ 00396 U8_FWD_1(s, i, length); \ 00397 --__N; \ 00398 } \ 00399 } 00400 00414 #define U8_SET_CP_START_UNSAFE(s, i) { \ 00415 while(U8_IS_TRAIL((s)[i])) { --(i); } \ 00416 } 00417 00432 #define U8_SET_CP_START(s, start, i) { \ 00433 if(U8_IS_TRAIL((s)[(i)])) { \ 00434 (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ 00435 } \ 00436 } 00437 00438 /* definitions with backward iteration -------------------------------------- */ 00439 00459 #define U8_PREV_UNSAFE(s, i, c) { \ 00460 (c)=(s)[--(i)]; \ 00461 if(U8_IS_TRAIL(c)) { \ 00462 uint8_t __b, __count=1, __shift=6; \ 00463 \ 00464 /* c is a trail byte */ \ 00465 (c)&=0x3f; \ 00466 for(;;) { \ 00467 __b=(s)[--(i)]; \ 00468 if(__b>=0xc0) { \ 00469 U8_MASK_LEAD_BYTE(__b, __count); \ 00470 (c)|=(UChar32)__b<<__shift; \ 00471 break; \ 00472 } else { \ 00473 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 00474 ++__count; \ 00475 __shift+=6; \ 00476 } \ 00477 } \ 00478 } \ 00479 } 00480 00501 #define U8_PREV(s, start, i, c) { \ 00502 (c)=(s)[--(i)]; \ 00503 if((c)>=0x80) { \ 00504 if((c)<=0xbf) { \ 00505 (c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \ 00506 } else { \ 00507 (c)=U_SENTINEL; \ 00508 } \ 00509 } \ 00510 } 00511 00523 #define U8_BACK_1_UNSAFE(s, i) { \ 00524 while(U8_IS_TRAIL((s)[--(i)])) {} \ 00525 } 00526 00539 #define U8_BACK_1(s, start, i) { \ 00540 if(U8_IS_TRAIL((s)[--(i)])) { \ 00541 (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ 00542 } \ 00543 } 00544 00558 #define U8_BACK_N_UNSAFE(s, i, n) { \ 00559 int32_t __N=(n); \ 00560 while(__N>0) { \ 00561 U8_BACK_1_UNSAFE(s, i); \ 00562 --__N; \ 00563 } \ 00564 } 00565 00580 #define U8_BACK_N(s, start, i, n) { \ 00581 int32_t __N=(n); \ 00582 while(__N>0 && (i)>(start)) { \ 00583 U8_BACK_1(s, start, i); \ 00584 --__N; \ 00585 } \ 00586 } 00587 00601 #define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ 00602 U8_BACK_1_UNSAFE(s, i); \ 00603 U8_FWD_1_UNSAFE(s, i); \ 00604 } 00605 00621 #define U8_SET_CP_LIMIT(s, start, i, length) { \ 00622 if((start)<(i) && (i)<(length)) { \ 00623 U8_BACK_1(s, start, i); \ 00624 U8_FWD_1(s, i, length); \ 00625 } \ 00626 } 00627 00628 #endif

Generated on Fri Jun 18 12:36:03 2004 for ICU by doxygen 1.3.7