Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

normlzr.h

Go to the documentation of this file.
00001 /*
00002  ********************************************************************
00003  * COPYRIGHT: 
00004  * Copyright (c) 1996-2001, International Business Machines Corporation and
00005  * others. All Rights Reserved.
00006  ********************************************************************
00007  */
00008 
00009 /*
00010 * Modification history
00011 * 
00012 * Date      Name      Description
00013 * 02/02/01  synwee    Added converters from EMode to UNormalizationMode, 
00014 *                     getUNormalizationMode and getNormalizerEMode,
00015 *                     useful in tbcoll and unorm.
00016 *                     Added quickcheck method and incorporated it into 
00017 *                     normalize()
00018 *                     Removed hard-coded EMode to UNormalizationMode 
00019 *                     conversion
00020 */
00021 
00022 #ifndef NORMLZR_H
00023 #define NORMLZR_H
00024 
00025 #include "unicode/utypes.h"
00026 #include "unicode/unistr.h"
00027 #include "unicode/chariter.h"
00028 #include "unicode/unorm.h"
00029 
00030 /* forward declaration */
00031 class ComposedCharIter;
00032 
00128 class U_COMMON_API Normalizer
00129 {
00130   // Declares static normalize/compose/decompose/quickCheck helpers plus an iterator-style interface (first/next/previous/...) over input text held through a CharacterIterator pointer.
00131  public:
00132   // This tells us what the bits in the "mode" mean.
00133   enum {
00134     COMPAT_BIT         = 1,  // set in COMPOSE_COMPAT and DECOMP_COMPAT
00135     DECOMP_BIT         = 2,  // set in DECOMP and DECOMP_COMPAT
00136     COMPOSE_BIT     = 4      // set in COMPOSE and COMPOSE_COMPAT
00137   };
00138 
00139 
00140   // DONE: presumably the end-of-iteration sentinel for the iteration methods below -- TODO confirm in normlzr.cpp.
00142   enum {
00143       DONE=0xffff
00144   };
00145   // EMode: the normalization modes, built from the *_BIT flags above. The UNORM_* equivalents noted here are the ones used by getUNormalizationMode().
00147   enum EMode {
00148 
00161     NO_OP         = 0,  // no bits set; maps to UNORM_NONE
00162     
00177     COMPOSE         = COMPOSE_BIT,  // maps to UNORM_NFC
00178 
00193     COMPOSE_COMPAT     = COMPOSE_BIT | COMPAT_BIT,  // maps to UNORM_NFKC
00194 
00209     DECOMP         = DECOMP_BIT,  // maps to UNORM_NFD
00210 
00225     DECOMP_COMPAT     = DECOMP_BIT | COMPAT_BIT  // maps to UNORM_NFKD
00226   };
00227   // Option flag; presumably passed through the int32_t opt/option/options parameters and setOption()/getOption() -- TODO confirm.
00229   enum {
00230 
00248     IGNORE_HANGUL     = 0x001
00249   };
00250 
00251   // Constructors
00252   // Each takes the input text (UnicodeString, UChar pointer + length, or CharacterIterator) and an EMode; the longer overloads add an int32_t option value.
00263   Normalizer(const UnicodeString& str, 
00264          EMode mode);
00265     
00284   Normalizer(const UnicodeString& str, 
00285          EMode mode, 
00286          int32_t opt);
00287 
00300   Normalizer(const UChar* str,
00301          int32_t length,
00302          EMode mode);
00303 
00320   Normalizer(const UChar* str,
00321          int32_t length,
00322          EMode mode,
00323          int32_t option);
00324 
00336   Normalizer(const CharacterIterator& iter, 
00337          EMode mode);
00338 
00354   Normalizer(const CharacterIterator& iter, 
00355          EMode mode, 
00356          int32_t opt);
00357   // Copy constructor.
00362   Normalizer(const Normalizer& copy);
00363   // Destructor (declared non-virtual).
00368   ~Normalizer();
00369 
00370 
00371   //-------------------------------------------------------------------------
00372   // Static utility methods
00373   //-------------------------------------------------------------------------
00374   // normalize: takes a source string, an EMode and options; result is a non-const reference (presumably the output) and status carries the error code.
00395   static void normalize(const UnicodeString& source, 
00396             EMode mode, 
00397             int32_t options,
00398             UnicodeString& result, 
00399             UErrorCode &status);
00400   // compose: same shape as normalize(), but takes a UBool compat instead of an EMode (presumably selecting the compatibility variant -- confirm).
00424   static void compose(const UnicodeString& source, 
00425               UBool compat,
00426               int32_t options,
00427               UnicodeString& result, 
00428               UErrorCode &status);
00429   // decompose: same parameter list as compose().
00456   static void decompose(const UnicodeString& source, 
00457             UBool compat,
00458             int32_t options,
00459             UnicodeString& result, 
00460             UErrorCode &status);
00461   // Converts an EMode to the matching UNormalizationMode; defined inline after the class.
00468   inline static UNormalizationMode getUNormalizationMode(EMode mode, 
00469                                                   UErrorCode& status);
00470   // Converts a UNormalizationMode to the matching EMode; defined inline after the class.
00477   inline static EMode getNormalizerEMode(UNormalizationMode mode, 
00478                                          UErrorCode& status);
00479   // quickCheck: returns a UNormalizationCheckResult for source under the given mode.
00495   static UNormalizationCheckResult
00496   quickCheck(const UnicodeString& source,
00497              EMode                mode, 
00498              UErrorCode&          status);
00499 
00500   //-------------------------------------------------------------------------
00501   // CharacterIterator overrides
00502   //-------------------------------------------------------------------------
00503   // NOTE(review): this class declares no base class, so these are plain (non-virtual) members rather than overrides.
00508   UChar32              current(void) const;
00509 
00515   UChar32              first(void);
00516 
00523   UChar32              last(void);
00524 
00531   UChar32              next(void);
00532 
00539   UChar32              previous(void);
00540 
00558   UChar32              setIndex(UTextOffset index);
00559 
00568   void                reset(void);
00569 
00584   UTextOffset            getIndex(void) const;
00585 
00592   UTextOffset            startIndex(void) const;
00593 
00600   UTextOffset            endIndex(void) const;
00601 
00602   // Equality is declared against Normalizer only; the CharacterIterator form is left commented out below.
00608   //  virtual UBool    operator==(const CharacterIterator& that) const;
00609   UBool        operator==(const Normalizer& that) const;
00610   inline UBool        operator!=(const Normalizer& that) const;
00611   // clone: returns a Normalizer pointer; presumably a new object the caller must delete -- TODO confirm.
00617   Normalizer*        clone(void) const;
00618 
00623   int32_t                hashCode(void) const;
00624 
00625   //-------------------------------------------------------------------------
00626   // Property access methods
00627   //-------------------------------------------------------------------------
00628   // Accessors for mode and options; presumably backed by the private fMode and fOptions fields -- confirm in normlzr.cpp.
00655   void setMode(EMode newMode);
00656 
00663   EMode getMode(void) const;
00664 
00688   void setOption(int32_t option, 
00689          UBool value);
00690 
00697   UBool getOption(int32_t option) const;
00698   // The setText overloads mirror the three constructor input forms; each takes a UErrorCode reference.
00704   void setText(const UnicodeString& newText, 
00705            UErrorCode &status);
00706 
00712   void setText(const CharacterIterator& newText, 
00713            UErrorCode &status);
00714 
00720   void setText(const UChar* newText,
00721                     int32_t length,
00722             UErrorCode &status);
00729   void            getText(UnicodeString&  result);  // result is a non-const reference, presumably the output
00730   // Returns a const UChar pointer; count is a non-const reference (presumably set to the length -- confirm).
00736   const UChar*     getText(int32_t&  count);
00737 
00738 private:
00739   // Private utility methods for iteration
00740   // For documentation, see the source code
00741   UChar nextCompose(void);
00742   UChar prevCompose(void);
00743   UChar nextDecomp(void);
00744   UChar prevDecomp(void);
00745   // NOTE(review): these helpers return UChar, whereas the public iteration methods return UChar32.
00746   UChar curForward(void);
00747   UChar curBackward(void);
00748 
00749   void    init(CharacterIterator* iter, 
00750          EMode mode, 
00751          int32_t option);
00752   void    initBuffer(void);
00753   void    clearBuffer(void);
00754 
00755   // Utilities used by Compose
00756   static void        bubbleAppend(UnicodeString& target, 
00757                      UChar ch, 
00758                      uint32_t cclass);
00759   static uint32_t     getComposeClass(UChar ch);
00760   static uint16_t    composeLookup(UChar ch);
00761   static uint16_t    composeAction(uint16_t baseIndex, 
00762                       uint16_t comIndex);
00763   static void        explode(UnicodeString& target, 
00764                 uint16_t index);
00765   static UChar    pairExplode(UnicodeString& target, 
00766                     uint16_t action);
00767 
00768   // Utilities used by Decompose
00769   static void        fixCanonical(UnicodeString& result);    // Reorders combining marks
00770   static uint8_t    getClass(UChar ch);                    // Gets char's combining class
00771 
00772   // Other static utility methods
00773   static void doAppend(const UChar source[], 
00774                uint16_t offset, 
00775                UnicodeString& dest);
00776   static void doInsert(const UChar source[], 
00777                uint16_t offset, 
00778                UnicodeString& dest, 
00779                UTextOffset pos);
00780   static uint16_t doReplace(const UChar source[], 
00781                uint16_t offset, 
00782                UnicodeString& dest, 
00783                UTextOffset pos);
00784 
00785   static void hangulToJamo(UChar ch, 
00786                UnicodeString& result, 
00787                uint16_t decompLimit);
00788   static void jamoAppend(UChar ch, 
00789              uint16_t decompLimit, 
00790              UnicodeString& dest);
00791   static void jamoToHangul(UnicodeString& buffer, 
00792                UTextOffset start);
00793 
00794   //-------------------------------------------------------------------------
00795   // Private data
00796   //-------------------------------------------------------------------------
00797 
00798   EMode         fMode;
00799   int32_t       fOptions;
00800   int16_t    minDecomp;
00801 
00802   // The input text and our position in it
00803   CharacterIterator*  text;  // raw pointer; ownership is not visible in this header -- TODO confirm in normlzr.cpp
00804 
00805   // A buffer for holding intermediate results
00806   UnicodeString       buffer;
00807   UTextOffset          bufferPos;
00808   UTextOffset          bufferLimit;
00809   UChar             currentChar;
00810 
00811   // Another buffer for use during iterative composition
00812   UnicodeString       explodeBuf;
00813 
00814   enum {
00815     EMPTY = -1,
00816     STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
00817     STR_LENGTH_MASK = 0x0003
00818   };
00819   // Hangul/Jamo constants. HANGUL_LIMIT equals HANGUL_BASE + JAMO_LCOUNT * JAMO_NCOUNT (0xac00 + 11172 = 0xd7a4).
00820   enum {
00821     HANGUL_BASE = 0xac00,
00822     HANGUL_LIMIT = 0xd7a4,
00823     JAMO_LBASE = 0x1100,
00824     JAMO_VBASE = 0x1161,
00825     JAMO_TBASE = 0x11a7,
00826     JAMO_LCOUNT = 19,
00827     JAMO_VCOUNT = 21,
00828     JAMO_TCOUNT = 28,
00829     JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
00830   };
00831   // ComposedCharIter is granted access to the private members above.
00832   friend class ComposedCharIter;
00833 };
00834 
00835 inline UBool  // Inequality operator: returns the logical negation of operator==(other).
00836 Normalizer::operator!= (const Normalizer& other) const
00837 { return ! operator==(other); }
00838 
00839 inline UNormalizationMode Normalizer::getUNormalizationMode(  // Converts a Normalizer::EMode into the corresponding UNormalizationMode.
00840                                    Normalizer::EMode  mode, UErrorCode &status)  // status: in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for an unrecognized mode
00841 {
00842   if (U_SUCCESS(status))  // convert only when no earlier failure is recorded in status
00843   { 
00844     switch (mode)
00845     {
00846     case Normalizer::NO_OP : 
00847       return UNORM_NONE;
00848     case Normalizer::COMPOSE :
00849       return UNORM_NFC;
00850     case Normalizer::COMPOSE_COMPAT :
00851       return UNORM_NFKC;
00852     case Normalizer::DECOMP :
00853       return UNORM_NFD;
00854     case Normalizer::DECOMP_COMPAT :
00855       return UNORM_NFKD;
00856     default :  // unrecognized EMode value: record the error, then fall out of the switch
00857       status = U_ILLEGAL_ARGUMENT_ERROR; 
00858     }
00859   }
00860   return UNORM_DEFAULT;  // reached when status was already a failure on entry or mode was not recognized
00861 }
00862 
00863 inline Normalizer::EMode Normalizer::getNormalizerEMode(  // Converts a UNormalizationMode into the corresponding Normalizer::EMode.
00864                                   UNormalizationMode mode, UErrorCode &status)  // status: in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for an unrecognized mode
00865 {
00866   if (U_SUCCESS(status))  // convert only when no earlier failure is recorded in status
00867   {
00868     switch (mode)
00869     {
00870     case UNORM_NONE :
00871       return Normalizer::NO_OP;
00872     case UNORM_NFD :
00873       return Normalizer::DECOMP;
00874     case UNORM_NFKD :
00875       return Normalizer::DECOMP_COMPAT;
00876     case UNORM_NFC :
00877       return Normalizer::COMPOSE;
00878     case UNORM_NFKC :
00879       return Normalizer::COMPOSE_COMPAT;
00880     default :  // unrecognized UNormalizationMode value: record the error, then fall out of the switch
00881       status = U_ILLEGAL_ARGUMENT_ERROR; 
00882     }
00883   }
00884   return Normalizer::DECOMP_COMPAT;  // reached when status was already a failure on entry or mode was not recognized
00885 }
00886 
00887 #endif // NORMLZR_H

Generated at Tue Jun 12 14:03:57 2001 for ICU 1.8.1 by doxygen 1.2.3 written by Dimitri van Heesch, © 1997-2000