From 82e011125e8d869896c50a23c3533d63f04190ed Mon Sep 17 00:00:00 2001 From: Vladimir Weinstein Date: Wed, 6 Dec 2000 00:52:58 +0000 Subject: [PATCH] ICU-756 normalization C API moved where it belongs X-SVN-Rev: 3145 --- icu4c/source/common/common.dsp | 25 ++++- icu4c/source/common/unicode/unorm.h | 146 ++++++++++++++++++++++++++++ icu4c/source/common/unorm.cpp | 58 +++++++++++ 3 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 icu4c/source/common/unicode/unorm.h create mode 100644 icu4c/source/common/unorm.cpp diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 7533101826a..b1f7e27beae 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -43,7 +43,7 @@ RSC=rc.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /YX /FD /c -# ADD CPP /nologo /MD /Ze /W3 /GX /I "..\..\include" /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /D "U_COMMON_IMPLEMENTATION" /YX /FD /c +# ADD CPP /nologo /MD /W3 /GX /I "..\..\include" /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /D "U_COMMON_IMPLEMENTATION" /YX /FD /c # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 # ADD BASE RSC /l 0x409 /d "NDEBUG" @@ -406,6 +406,10 @@ SOURCE=.\unistr.cpp # End Source File # Begin Source File +SOURCE=.\unorm.cpp +# End Source File +# Begin Source File + SOURCE=.\uresbund.c # End Source File # Begin Source File @@ -1155,6 +1159,25 @@ SOURCE=.\unistrm.h # End Source File # Begin Source File +SOURCE=.\unicode\unorm.h + +!IF "$(CFG)" == "common - Win32 Release" + +!ELSEIF "$(CFG)" == "common - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\unorm.h + +"..\..\include\unicode\unorm.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\unorm.h ..\..\include\unicode + +# End Custom 
Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\urep.h !IF "$(CFG)" == "common - Win32 Release" diff --git a/icu4c/source/common/unicode/unorm.h b/icu4c/source/common/unicode/unorm.h new file mode 100644 index 00000000000..54810d823ff --- /dev/null +++ b/icu4c/source/common/unicode/unorm.h @@ -0,0 +1,146 @@ +/* +******************************************************************************* +* Copyright © {1996-2001}, International Business Machines Corporation and others. All Rights Reserved. +******************************************************************************* +* File unorm.h +* +* Created by: Vladimir Weinstein 12052000 +* +*/ +#ifndef UNORM_H +#define UNORM_H + +#include "unicode/utypes.h" + +/** + * @name Unicode normalization API + * + * u_normalize transforms Unicode text into an equivalent composed or + * decomposed form, allowing for easier sorting and searching of text. + * u_normalize supports the standard normalization forms described in + * + * Unicode Technical Report #15. + *

+ * Characters with accents or other adornments can be encoded in + * several different ways in Unicode. For example, take the character "Á" + * (A-acute). In Unicode, this can be encoded as a single character (the + * "composed" form): + *

+ *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
+ * or as two separate characters (the "decomposed" form): + *
+ *      0041    LATIN CAPITAL LETTER A
+ *      0301    COMBINING ACUTE ACCENT
+ *

+ * To a user of your program, however, both of these sequences should be + * treated as the same "user-level" character "Á". When you are searching or + * comparing text, you must ensure that these two sequences are treated + * equivalently. In addition, you must handle characters with more than one + * accent. Sometimes the order of a character's combining accents is + * significant, while in other cases accent sequences in different orders are + * really equivalent. + *

+ * Similarly, the string "ffi" can be encoded as three separate letters: + *

+ *      0066    LATIN SMALL LETTER F
+ *      0066    LATIN SMALL LETTER F
+ *      0069    LATIN SMALL LETTER I
+ * or as the single character + *
+ *      FB03    LATIN SMALL LIGATURE FFI
+ *

+ * The ffi ligature is not a distinct semantic character, and strictly speaking + * it shouldn't be in Unicode at all, but it was included for compatibility + * with existing character sets that already provided it. The Unicode standard + * identifies such characters by giving them "compatibility" decompositions + * into the corresponding semantic characters. When sorting and searching, you + * will often want to use these mappings. + *

+ * u_normalize helps solve these problems by transforming text into the + * canonical composed and decomposed forms as shown in the first example above. + * In addition, you can have it perform compatibility decompositions so that + * you can treat compatibility characters the same as their equivalents. + * Finally, u_normalize rearranges accents into the proper canonical + * order, so that you do not have to worry about accent rearrangement on your + * own. + *

+ * u_normalize adds one optional behavior, {@link #UCOL_IGNORE_HANGUL}, + * that differs from + * the standard Unicode Normalization Forms. + **/ + + /** + * Normalization modes and options accepted by u_normalize. Every UCOL_ mode + * has a UNORM_ counterpart with the same numeric value; only the two + * "default" names differ (see below). + * UCOL_NO_NORMALIZATION : Accented characters will not be decomposed for sorting. + * UCOL_DECOMP_CAN : Characters that are canonical variants according + * to Unicode 2.0 will be decomposed for sorting. + * UCOL_DECOMP_COMPAT : Characters that are compatibility variants will be + * decomposed for sorting. This is the default normalization mode used + * (UCOL_DEFAULT_NORMALIZATION); note that UNORM_DEFAULT is UNORM_NFC instead. + * UCOL_DECOMP_CAN_COMP_COMPAT : Canonical decomposition followed by canonical composition + * UCOL_DECOMP_COMPAT_COMP_CAN : Compatibility decomposition followed by canonical composition + * + **/ + +typedef enum { + /** No decomposition/composition */ + UCOL_NO_NORMALIZATION = 1, + /** Canonical decomposition */ + UCOL_DECOMP_CAN = 2, + /** Compatibility decomposition */ + UCOL_DECOMP_COMPAT = 3, + /** Default normalization for the UCOL_ names: compatibility decomposition */ + UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT, + /** Canonical decomposition followed by canonical composition */ + UCOL_DECOMP_CAN_COMP_COMPAT = 4, + /** Compatibility decomposition followed by canonical composition */ + UCOL_DECOMP_COMPAT_COMP_CAN =5, + /** No decomposition/composition (same value as UCOL_NO_NORMALIZATION) */ + UNORM_NONE = 1, + /** Canonical decomposition (same value as UCOL_DECOMP_CAN) */ + UNORM_NFD = 2, + /** Compatibility decomposition (same value as UCOL_DECOMP_COMPAT) */ + UNORM_NFKD = 3, + /** Canonical decomposition followed by canonical composition (same value as UCOL_DECOMP_CAN_COMP_COMPAT) */ + UNORM_NFC = 4, + /** Default normalization for the UNORM_ names: UNORM_NFC */ + UNORM_DEFAULT = UNORM_NFC, + /** Compatibility decomposition followed by canonical composition (same value as UCOL_DECOMP_COMPAT_COMP_CAN) */ + UNORM_NFKC =5, + /** One more than the highest mode value above (evaluates to 6) */ + UNORM_MODE_COUNT, + + /** Do not normalize Hangul; passed in the options argument of u_normalize. UNORM_IGNORE_HANGUL is the same value. */ + UCOL_IGNORE_HANGUL = 16, + UNORM_IGNORE_HANGUL = 16 +} UNormalizationMode; + +/** Possible normalization options */ +typedef UNormalizationMode UNormalizationOption; + +/** + * Normalize a string. + * The string will be normalized according to the specified normalization mode + * and options. + * @param source The string to normalize. 
+ * @param sourceLength The length of source, or -1 if null-terminated. + * @param mode The normalization mode; one of UCOL_NO_NORMALIZATION, + * UCOL_DECOMP_CAN, UCOL_DECOMP_COMPAT, UCOL_DECOMP_CAN_COMP_COMPAT, + * UCOL_DECOMP_COMPAT_COMP_CAN, UCOL_DEFAULT_NORMALIZATION (or the UNORM_ + * constants that share those values) + * @param options The normalization options, ORed together; possible values + * are UCOL_IGNORE_HANGUL + * @param result A pointer to a buffer to receive the normalized string. + * @param resultLength The maximum size of result. + * @param status A pointer to a UErrorCode to receive any errors + * @return The total buffer size needed; if greater than resultLength, + * the output was truncated. Returns -1 if status already indicates a + * failure on entry or if mode is not one of the values listed above. + * @stable + */ +U_CAPI int32_t +u_normalize(const UChar* source, + int32_t sourceLength, + UNormalizationMode mode, + int32_t options, + UChar* result, + int32_t resultLength, + UErrorCode* status); + +#endif diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp new file mode 100644 index 00000000000..62773f059bc --- /dev/null +++ b/icu4c/source/common/unorm.cpp @@ -0,0 +1,58 @@ +/* +******************************************************************************* +* Copyright © {1996-2001}, International Business Machines Corporation and others. All Rights Reserved. 
+*******************************************************************************
+* File unorm.cpp
+*
+* Created by: Vladimir Weinstein 12052000
+*
+*/
+
+#include "unicode/unorm.h"
+#include "unicode/normlzr.h"
+#include "unicode/ustring.h"
+#include "cpputils.h"
+
+/**
+ * C entry point for normalization: maps the C mode constant onto the
+ * corresponding Normalizer::EMode, runs Normalizer::normalize on the
+ * source text and hands the result to T_fillOutputParams, which writes
+ * into the caller's buffer and reports the length.
+ *
+ * @param source       Text to normalize; may be null only if sourceLength is 0.
+ * @param sourceLength Number of UChars in source, or -1 to have the length
+ *                     computed with u_strlen.
+ * @param mode         One of the five mode values (1..5) declared in
+ *                     UNormalizationMode; anything else is rejected.
+ * @param options      Passed through unchanged to Normalizer::normalize.
+ * @param result       Output buffer; may be null only if resultLength is 0.
+ * @param resultLength Capacity of result in UChars; must not be negative.
+ * @param status       In/out error code. Must not be null. Set to
+ *                     U_ILLEGAL_ARGUMENT_ERROR for a bad mode, length or
+ *                     buffer argument.
+ * @return The length reported by T_fillOutputParams, or -1 if status is
+ *         null, status already holds a failure on entry, or an argument
+ *         was rejected.
+ */
+U_CAPI int32_t
+u_normalize(const UChar* source,
+            int32_t sourceLength,
+            UNormalizationMode mode,
+            int32_t options,
+            UChar* result,
+            int32_t resultLength,
+            UErrorCode* status)
+{
+    /* Without a status object nothing can be reported, so check the pointer
+       before U_FAILURE dereferences it. */
+    if(status == 0 || U_FAILURE(*status)) {
+        return -1;
+    }
+
+    /* Reject pointer/length combinations that would make u_strlen or the
+       UnicodeString wrappers below read or write through a null pointer.
+       A null pointer together with a zero length is still accepted. */
+    if((source == 0 ? sourceLength != 0 : sourceLength < -1) ||
+       (result == 0 ? resultLength != 0 : resultLength < 0)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return -1;
+    }
+
+    /* Translate the C constant into the C++ Normalizer mode. */
+    Normalizer::EMode normMode;
+    switch(mode) {
+    case UCOL_NO_NORMALIZATION:
+        normMode = Normalizer::NO_OP;
+        break;
+    case UCOL_DECOMP_CAN:
+        normMode = Normalizer::DECOMP;
+        break;
+    case UCOL_DECOMP_COMPAT:
+        normMode = Normalizer::DECOMP_COMPAT;
+        break;
+    case UCOL_DECOMP_CAN_COMP_COMPAT:
+        normMode = Normalizer::COMPOSE;
+        break;
+    case UCOL_DECOMP_COMPAT_COMP_CAN:
+        normMode = Normalizer::COMPOSE_COMPAT;
+        break;
+    default:
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return -1;
+    }
+
+    const int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
+
+    /* NOTE(review): the (buffer, length, capacity) constructor presumably wraps
+       the caller's storage instead of copying it, which is why const has to be
+       cast away here; src itself is const and is only read. Confirm in unistr.h. */
+    const UnicodeString src((UChar*)source, len, len);
+    UnicodeString dst(result, 0, resultLength);
+    Normalizer::normalize(src, normMode, options, dst, *status);
+
+    /* Start from the error value so a defined result is returned even if the
+       helper leaves the out-parameter untouched. */
+    int32_t actualLen = -1;
+    T_fillOutputParams(&dst, result, resultLength, &actualLen, status);
+    return actualLen;
+}
+
+