/* ***************************************************************************************** * Copyright (C) 1996-1999, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ // FILE NAME : unicode.h // // CREATED // Wednesday, December 11, 1996 // // CREATED BY // Helena Shih // // CHANGES // Thursday, April 15, 1999 // Modified the definitions of all the functions // C++ Wrappers for Unicode // CHANGES BY // Madhu Katragadda // 5/20/99 Madhu Added the function getVersion() // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit //******************************************************************************************** #ifndef UNICODE_H #define UNICODE_H #include "unicode/utypes.h" #include "unicode/uchar.h" class U_COMMON_API Unicode { public: static const UChar MIN_VALUE; static const UChar MAX_VALUE; enum EUnicodeGeneralTypes { UNASSIGNED = 0, UPPERCASE_LETTER = 1, LOWERCASE_LETTER = 2, TITLECASE_LETTER = 3, MODIFIER_LETTER = 4, OTHER_LETTER = 5, NON_SPACING_MARK = 6, ENCLOSING_MARK = 7, COMBINING_SPACING_MARK = 8, DECIMAL_DIGIT_NUMBER = 9, LETTER_NUMBER = 10, OTHER_NUMBER = 11, SPACE_SEPARATOR = 12, LINE_SEPARATOR = 13, PARAGRAPH_SEPARATOR = 14, CONTROL = 15, FORMAT = 16, PRIVATE_USE = 17, SURROGATE = 18, DASH_PUNCTUATION = 19, START_PUNCTUATION = 20, END_PUNCTUATION = 21, CONNECTOR_PUNCTUATION = 22, OTHER_PUNCTUATION = 23, MATH_SYMBOL = 24, CURRENCY_SYMBOL = 25, MODIFIER_SYMBOL = 26, OTHER_SYMBOL = 27, INITIAL_PUNCTUATION = 28, FINAL_PUNCTUATION = 29, GENERAL_TYPES_COUNT = 30 }; enum EUnicodeScript { kBasicLatin, kLatin1Supplement, kLatinExtendedA, kLatinExtendedB, kIPAExtension, kSpacingModifier, kCombiningDiacritical, kGreek, kCyrillic, kArmenian, kHebrew, kArabic, kDevanagari, kBengali, kGurmukhi, kGujarati, kOriya, kTamil, kTelugu, kKannada, kMalayalam, kThai, kLao, kTibetan, kGeorgian, kHangulJamo, kLatinExtendedAdditional, kGreekExtended, kGeneralPunctuation, kSuperSubScript, kCurrencySymbolScript, kSymbolCombiningMark, kLetterlikeSymbol, kNumberForm, kArrow, kMathOperator, kMiscTechnical, kControlPicture, kOpticalCharacter, kEnclosedAlphanumeric, kBoxDrawing, kBlockElement, kGeometricShape, kMiscSymbol, kDingbat, kCJKSymbolPunctuation, kHiragana, kKatakana, kBopomofo, kHangulCompatibilityJamo, kKanbun, kEnclosedCJKLetterMonth, kCJKCompatibility, kCJKUnifiedIdeograph, kHangulSyllable, kHighSurrogate, kHighPrivateUseSurrogate, kLowSurrogate, kPrivateUse, kCJKCompatibilityIdeograph, kAlphabeticPresentation, kArabicPresentationA, kCombiningHalfMark, kCJKCompatibilityForm, kSmallFormVariant, kArabicPresentationB, kNoScript, kHalfwidthFullwidthForm, kScriptCount }; enum EDirectionProperty { LEFT_TO_RIGHT = 0, RIGHT_TO_LEFT = 1, EUROPEAN_NUMBER = 2, EUROPEAN_NUMBER_SEPARATOR = 3, EUROPEAN_NUMBER_TERMINATOR = 4, ARABIC_NUMBER = 5, COMMON_NUMBER_SEPARATOR = 6, BLOCK_SEPARATOR = 7, SEGMENT_SEPARATOR = 8, WHITE_SPACE_NEUTRAL = 9, OTHER_NEUTRAL = 10, LEFT_TO_RIGHT_EMBEDDING = 11, LEFT_TO_RIGHT_OVERRIDE = 12, RIGHT_TO_LEFT_ARABIC = 13, RIGHT_TO_LEFT_EMBEDDING = 14, RIGHT_TO_LEFT_OVERRIDE = 15, POP_DIRECTIONAL_FORMAT = 16, DIR_NON_SPACING_MARK = 17, BOUNDARY_NEUTRAL = 18 }; enum ECellWidths { ZERO_WIDTH = 0, HALF_WIDTH = 1, FULL_WIDTH = 2, NEUTRAL = 3 }; static const int8_t MIN_RADIX; static const int8_t MAX_RADIX; static bool_t isLowerCase(UChar ch); static bool_t isUpperCase(UChar ch); static bool_t isTitleCase(UChar ch); static bool_t isDigit(UChar ch); static bool_t isDefined(UChar ch); static bool_t isControl(UChar ch); static bool_t isPrintable(UChar ch); static bool_t isBaseForm(UChar ch); static bool_t isLetter(UChar ch); static bool_t isJavaIdentifierStart(UChar ch); static bool_t isJavaIdentifierPart(UChar ch); static bool_t isUnicodeIdentifierStart(UChar ch); static bool_t isUnicodeIdentifierPart(UChar ch); static bool_t isIdentifierIgnorable(UChar ch); static UChar toLowerCase(UChar ch); static UChar toUpperCase(UChar ch); static UChar toTitleCase(UChar ch); static bool_t isSpaceChar(UChar ch); static bool_t isWhitespace(UChar ch); static int8_t getType(UChar ch); static EDirectionProperty characterDirection(UChar ch); static EUnicodeScript getScript(UChar ch); static uint16_t getCellWidth(UChar ch); static inline UTextOffset getCharName(uint32_t code, char *buffer, UTextOffset bufferLength, UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME); static int32_t digitValue(UChar ch); static int8_t digit(UChar ch, int8_t radix); static UChar forDigit(int32_t digit, int8_t radix); static void getUnicodeVersion(UVersionInfo info); protected: // These constructors, destructor, and assignment operator must // be protected (not private, as they semantically are) to make // various UNIX compilers happy. [LIU] Unicode(); Unicode( const Unicode& other); ~Unicode(); const Unicode& operator=( const Unicode& other); }; inline UTextOffset Unicode::getCharName(uint32_t code, char *buffer, UTextOffset bufferLength, UCharNameChoice nameChoice) { UErrorCode errorCode=U_ZERO_ERROR; UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); return U_SUCCESS(errorCode) ? length : 0; } #endif