From 82e011125e8d869896c50a23c3533d63f04190ed Mon Sep 17 00:00:00 2001
From: Vladimir Weinstein <icu@weivsara.com>
Date: Wed, 6 Dec 2000 00:52:58 +0000
Subject: [PATCH] ICU-756 normalization C API moved where it belongs

X-SVN-Rev: 3145
---
 icu4c/source/common/common.dsp      |  25 ++++-
 icu4c/source/common/unicode/unorm.h | 146 ++++++++++++++++++++++++++++
 icu4c/source/common/unorm.cpp       |  58 +++++++++++
 3 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 icu4c/source/common/unicode/unorm.h
 create mode 100644 icu4c/source/common/unorm.cpp
diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp
index 7533101826a..b1f7e27beae 100644
--- a/icu4c/source/common/common.dsp
+++ b/icu4c/source/common/common.dsp
@@ -43,7 +43,7 @@ RSC=rc.exe
 # PROP Ignore_Export_Lib 0
 # PROP Target_Dir ""
 # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /YX /FD /c
-# ADD CPP /nologo /MD /Ze /W3 /GX /I "..\..\include" /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /D "U_COMMON_IMPLEMENTATION" /YX /FD /c
+# ADD CPP /nologo /MD /W3 /GX /I "..\..\include" /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "COMMON_EXPORTS" /D "U_COMMON_IMPLEMENTATION" /YX /FD /c
 # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
 # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
 # ADD BASE RSC /l 0x409 /d "NDEBUG"
@@ -406,6 +406,10 @@ SOURCE=.\unistr.cpp
 # End Source File
 # Begin Source File
 
+SOURCE=.\unorm.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\uresbund.c
 # End Source File
 # Begin Source File
@@ -1155,6 +1159,25 @@ SOURCE=.\unistrm.h
 # End Source File
 # Begin Source File
 
+SOURCE=.\unicode\unorm.h
+
+!IF  "$(CFG)" == "common - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "common - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\unorm.h
+
+"..\..\include\unicode\unorm.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    unicode\unorm.h    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\urep.h
 
 !IF  "$(CFG)" == "common - Win32 Release"
diff --git a/icu4c/source/common/unicode/unorm.h b/icu4c/source/common/unicode/unorm.h
new file mode 100644
index 00000000000..54810d823ff
--- /dev/null
+++ b/icu4c/source/common/unicode/unorm.h
@@ -0,0 +1,146 @@
+/*
+*******************************************************************************
+* Copyright (C) {1996-2001}, International Business Machines Corporation and others. All Rights Reserved.
+*******************************************************************************
+* File unorm.h
+*
+* Created by: Vladimir Weinstein 12052000
+*
+*/
+#ifndef UNORM_H
+#define UNORM_H
+
+#include "unicode/utypes.h"
+
+/**
+ * @name Unicode normalization API
+ *
+ * <tt>u_normalize</tt> transforms Unicode text into an equivalent composed or
+ * decomposed form, allowing for easier sorting and searching of text.
+ * <tt>u_normalize</tt> supports the standard normalization forms described in
+ * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
+ * Unicode Technical Report #15</a>.
+ * <p>
+ * Characters with accents or other adornments can be encoded in
+ * several different ways in Unicode.  For example, take the character "Á"
+ * (A-acute).   In Unicode, this can be encoded as a single character (the
+ * "composed" form):
+ * <pre>
+ *      00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
+ * or as two separate characters (the "decomposed" form):
+ * <pre>
+ *      0041    LATIN CAPITAL LETTER A
+ *      0301    COMBINING ACUTE ACCENT</pre>
+ * <p>
+ * To a user of your program, however, both of these sequences should be
+ * treated as the same "user-level" character "Á".  When you are searching or
+ * comparing text, you must ensure that these two sequences are treated 
+ * equivalently.  In addition, you must handle characters with more than one
+ * accent.  Sometimes the order of a character's combining accents is
+ * significant, while in other cases accent sequences in different orders are
+ * really equivalent.
+ * <p>
+ * Similarly, the string "ffi" can be encoded as three separate letters:
+ * <pre>
+ *      0066    LATIN SMALL LETTER F
+ *      0066    LATIN SMALL LETTER F
+ *      0069    LATIN SMALL LETTER I</pre>
+ * or as the single character
+ * <pre>
+ *      FB03    LATIN SMALL LIGATURE FFI</pre>
+ * <p>
+ * The ffi ligature is not a distinct semantic character, and strictly speaking
+ * it shouldn't be in Unicode at all, but it was included for compatibility
+ * with existing character sets that already provided it.  The Unicode standard
+ * identifies such characters by giving them "compatibility" decompositions
+ * into the corresponding semantic characters.  When sorting and searching, you
+ * will often want to use these mappings.
+ * <p>
+ * <tt>u_normalize</tt> helps solve these problems by transforming text into the
+ * canonical composed and decomposed forms as shown in the first example above.  
+ * In addition, you can have it perform compatibility decompositions so that 
+ * you can treat compatibility characters the same as their equivalents.
+ * Finally, <tt>u_normalize</tt> rearranges accents into the proper canonical
+ * order, so that you do not have to worry about accent rearrangement on your
+ * own.
+ * <p>
+ * <tt>u_normalize</tt> adds one optional behavior, {@link #UCOL_IGNORE_HANGUL},
+ * that differs from
+ * the standard Unicode Normalization Forms. 
+ **/
+
+  /**
+    * UCOL_NO_NORMALIZATION : Accented characters will not be decomposed for sorting.  
+    * UCOL_DECOMP_CAN         : Characters that are canonical variants according 
+    * to Unicode 2.0 will be decomposed for sorting. 
+    * UCOL_DECOMP_COMPAT    : Characters that are compatibility variants will be
+    * decomposed for sorting. This is the default normalization mode used.
+    * UCOL_DECOMP_CAN_COMP_COMPAT : Canonical decomposition followed by canonical composition 
+    * UCOL_DECOMP_COMPAT_COMP_CAN : Compatibility decomposition followed by canonical composition
+    *
+    **/
+
+typedef enum {
+  /** No decomposition/composition */
+  UCOL_NO_NORMALIZATION = 1,
+  /** Canonical decomposition */
+  UCOL_DECOMP_CAN = 2,
+  /** Compatibility decomposition */
+  UCOL_DECOMP_COMPAT = 3,
+  /** Default normalization for the UCOL_ names (compatibility decomposition) */
+  UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT, 
+  /** Canonical decomposition followed by canonical composition */
+  UCOL_DECOMP_CAN_COMP_COMPAT = 4,
+  /** Compatibility decomposition followed by canonical composition */
+  UCOL_DECOMP_COMPAT_COMP_CAN =5,
+  /** No decomposition/composition; same value as UCOL_NO_NORMALIZATION */
+  UNORM_NONE = 1, 
+  /** Canonical decomposition; same value as UCOL_DECOMP_CAN */
+  UNORM_NFD = 2,
+  /** Compatibility decomposition; same value as UCOL_DECOMP_COMPAT */
+  UNORM_NFKD = 3,
+  /** Canonical decomposition followed by canonical composition */
+  UNORM_NFC = 4,
+  /** Default for the UNORM_ names (NFC); differs from UCOL_DEFAULT_NORMALIZATION */
+  UNORM_DEFAULT = UNORM_NFC, 
+  /** Compatibility decomposition followed by canonical composition */
+  UNORM_NFKC =5,
+  /** One more than the highest mode value above (i.e. 6) */
+  UNORM_MODE_COUNT,
+
+  /** Do not normalize Hangul; option flag, both names below share the value 16 */
+  UCOL_IGNORE_HANGUL    = 16,
+  UNORM_IGNORE_HANGUL    = 16
+} UNormalizationMode;
+
+/** Possible normalization options (same enum type as the modes) */
+typedef UNormalizationMode UNormalizationOption;
+
+/**
+ * Normalize a string.
+ * The string will be normalized according to the specified normalization mode
+ * and options.
+ * @param source The string to normalize.
+ * @param sourceLength The length of source, or -1 if null-terminated.
+ * @param mode The normalization mode; one of UCOL_NO_NORMALIZATION, 
+ * UCOL_DECOMP_CAN, UCOL_DECOMP_COMPAT, UCOL_DECOMP_CAN_COMP_COMPAT, 
+ * UCOL_DECOMP_COMPAT_COMP_CAN, UCOL_DEFAULT_NORMALIZATION
+ * @param options The normalization options, ORed together; possible values
+ * are UCOL_IGNORE_HANGUL
+ * @param result A pointer to a buffer to receive the normalized string.
+ * @param resultLength The maximum size of result.
+ * @param status A pointer to an UErrorCode to receive any errors
+ * @return The total buffer size needed; if greater than resultLength,
+ * the output was truncated.
+ * @stable
+ */
+U_CAPI int32_t
+u_normalize(const UChar*           source,
+        int32_t                 sourceLength, 
+        UNormalizationMode      mode, 
+        int32_t            options,
+        UChar*                  result,
+        int32_t                 resultLength,
+        UErrorCode*             status);    
+
+#endif
diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp
new file mode 100644
index 00000000000..62773f059bc
--- /dev/null
+++ b/icu4c/source/common/unorm.cpp
@@ -0,0 +1,58 @@
+/*
+*******************************************************************************
+* Copyright (C) {1996-2001}, International Business Machines Corporation and others. All Rights Reserved.
+*******************************************************************************
+* File unorm.cpp
+*
+* Created by: Vladimir Weinstein 12052000
+*
+*/
+
+#include "unicode/unorm.h"
+#include "unicode/normlzr.h"
+#include "unicode/ustring.h"
+#include "cpputils.h"
+
+U_CAPI int32_t
+u_normalize(const UChar*            source,
+        int32_t                 sourceLength,
+        UNormalizationMode      mode,
+        int32_t                 option,
+        UChar*                  result,
+        int32_t                 resultLength,
+        UErrorCode*             status)
+{
+  /* Bail out on a missing status pointer or an error left by an earlier call. */
+  if(status == 0 || U_FAILURE(*status)) return -1;
+  /* Reject a NULL source with non-zero length, bad lengths, or an unusable result buffer. */
+  if((source == 0 && sourceLength != 0) || sourceLength < -1 ||
+     resultLength < 0 || (result == 0 && resultLength > 0)) {
+    *status = U_ILLEGAL_ARGUMENT_ERROR;
+    return -1;
+  }
+
+  /* Map the C mode onto Normalizer::EMode (the UNORM_* names share these values). */
+  Normalizer::EMode normMode;
+  switch(mode) {
+  case UCOL_NO_NORMALIZATION:       normMode = Normalizer::NO_OP;          break;
+  case UCOL_DECOMP_CAN:             normMode = Normalizer::DECOMP;         break;
+  case UCOL_DECOMP_COMPAT:          normMode = Normalizer::DECOMP_COMPAT;  break;
+  case UCOL_DECOMP_CAN_COMP_COMPAT: normMode = Normalizer::COMPOSE;        break;
+  case UCOL_DECOMP_COMPAT_COMP_CAN: normMode = Normalizer::COMPOSE_COMPAT; break;
+  default: /* UNORM_MODE_COUNT, the option flags, or any other value */
+    *status = U_ILLEGAL_ARGUMENT_ERROR;
+    return -1;
+  }
+
+  /* sourceLength == -1 means the source is NUL-terminated, so measure it. */
+  int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
+  const UnicodeString src((UChar*)source, len, len);
+  /* dst is constructed from the caller's buffer (length 0, capacity resultLength). */
+  UnicodeString dst(result, 0, resultLength);
+  Normalizer::normalize(src, normMode, option, dst, *status);
+  int32_t actualLen = 0;
+  T_fillOutputParams(&dst, result, resultLength, &actualLen, status);
+  return actualLen;
+}
+
+