ICU-221 add isWhitespace, isSpace

X-SVN-Rev: 546
2025-04-21 12:40:02 +00:00 · 2000-01-12 18:01:51 +00:00 · 2000-01-12 18:01:51 +00:00 · b06425886b
commit b06425886b
parent fea8b1b618
2 changed files with 145 additions and 0 deletions
--- a/icu4c/source/common/unicode.cpp
+++ b/icu4c/source/common/unicode.cpp
@ -202,6 +202,40 @@ Unicode::toTitleCase(UChar ch)
    return(u_totitle(ch) );
 }

+/**
+ * Determines if the specified character is ISO-LATIN-1 white space.
+ * This method returns <code>true</code> for the following five
+ * characters only:
+ * <table>
+ * <tr><td>'\t'</td>            <td>&#92;u0009</td>
+ *     <td><code>HORIZONTAL TABULATION</code></td></tr>
+ * <tr><td>'\n'</td>            <td>&#92;u000A</td>
+ *     <td><code>NEW LINE</code></td></tr>
+ * <tr><td>'\f'</td>            <td>&#92;u000C</td>
+ *     <td><code>FORM FEED</code></td></tr>
+ * <tr><td>'\r'</td>            <td>&#92;u000D</td>
+ *     <td><code>CARRIAGE RETURN</code></td></tr>
+ * <tr><td>'&nbsp;&nbsp;'</td>  <td>&#92;u0020</td>
+ *     <td><code>SPACE</code></td></tr>
+ * </table>
+ *
+ * @param      ch   the character to be tested.
+ * @return     <code>true</code> if the character is ISO-LATIN-1 white
+ *             space; <code>false</code> otherwise.
+ * @see        #isSpaceChar
+ * @see        #isWhitespace
+ * @deprecated Replaced by isWhitespace(UChar).
+ */
+bool_t
+Unicode::isSpace(UChar ch) {
+    // Explicit comparisons are used rather than a 32-bit bit mask:
+    // testing U+0020 with a mask requires shifting a 32-bit value by
+    // 32 positions, which is undefined behavior in C++ and can
+    // misclassify U+0020 (or U+0000 when the shift count wraps).
+    return (ch == 0x0009) || (ch == 0x000A) || (ch == 0x000C) ||
+           (ch == 0x000D) || (ch == 0x0020);
+}
+
 // Checks if the Unicode character is a space character.
 bool_t
 Unicode::isSpaceChar(UChar ch) 
@ -209,6 +243,64 @@ Unicode::isSpaceChar(UChar ch)
    return(u_isspace(ch) );
 }

+/**
+ * Determines if the specified character is white space according to ICU.
+ * A character is considered to be an ICU whitespace character if and only
+ * if it satisfies one of the following criteria:
+ * <ul>
+ * <li> It is a Unicode space separator (category "Zs"), but is not
+ *      a no-break space (&#92;u00A0 or &#92;uFEFF).
+ * <li> It is a Unicode line separator (category "Zl").
+ * <li> It is a Unicode paragraph separator (category "Zp").
+ * <li> It is &#92;u0009, HORIZONTAL TABULATION.
+ * <li> It is &#92;u000A, LINE FEED.
+ * <li> It is &#92;u000B, VERTICAL TABULATION.
+ * <li> It is &#92;u000C, FORM FEED.
+ * <li> It is &#92;u000D, CARRIAGE RETURN.
+ * <li> It is &#92;u001C, FILE SEPARATOR.
+ * <li> It is &#92;u001D, GROUP SEPARATOR.
+ * <li> It is &#92;u001E, RECORD SEPARATOR.
+ * <li> It is &#92;u001F, UNIT SEPARATOR.
+ * </ul>
+ *
+ * @param   ch  the character to be tested.
+ * @return  true if the character is an ICU whitespace character;
+ *          false otherwise.
+ * @see     #isSpaceChar
+ */
+bool_t
+Unicode::isWhitespace(UChar ch) {
+    // From Mark Davis:
+    //| What we should do is to make sure that the special Cc characters like CR
+    //| have either Zs, Zl, or Zp in the property database. We can then just call
+    //| the equivalent of:
+    //|
+    //|  public static boolean isWhitespace(char ch) {
+    //|   return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
+    //|
+    //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
+    //|
+    //| This is much faster code, since it just looks up the property value and
+    //| does a couple of arithmetic operations to get the right answer.
+
+    // TEMPORARY IMPLEMENTATION until the tables are updated to
+    // modify Cc character categories:
+    const int8_t cat = Unicode::getType(ch);
+
+    // The control characters are matched with range comparisons
+    // instead of a signed 32-bit mask, because int32_t(1) << 0x001F
+    // shifts a one into the sign bit and is not portable across
+    // compilers and language versions.
+    return
+        (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
+        (cat == LINE_SEPARATOR) ||
+        (cat == PARAGRAPH_SEPARATOR) ||
+        // U+0009..U+000D: TAB, LF, VT, FF, CR
+        (ch >= 0x0009 && ch <= 0x000D) ||
+        // U+001C..U+001F: FS, GS, RS, US
+        (ch >= 0x001C && ch <= 0x001F);
+}
+
 // Gets if the Unicode character's character property.
 int8_t
 Unicode::getType(UChar ch)
--- a/icu4c/source/common/unicode/unicode.h
+++ b/icu4c/source/common/unicode/unicode.h
@ -528,6 +528,32 @@ public:
     */
    static  UChar             toTitleCase(UChar     ch);

+    /**
+     * Determines if the specified character is ISO-LATIN-1 white space.
+     * This method returns <code>true</code> for the following five
+     * characters only:
+     * <table>
+     * <tr><td>'\t'</td>            <td>&#92;u0009</td>
+     *     <td><code>HORIZONTAL TABULATION</code></td></tr>
+     * <tr><td>'\n'</td>            <td>&#92;u000A</td>
+     *     <td><code>NEW LINE</code></td></tr>
+     * <tr><td>'\f'</td>            <td>&#92;u000C</td>
+     *     <td><code>FORM FEED</code></td></tr>
+     * <tr><td>'\r'</td>            <td>&#92;u000D</td>
+     *     <td><code>CARRIAGE RETURN</code></td></tr>
+     * <tr><td>'&nbsp;&nbsp;'</td>  <td>&#92;u0020</td>
+     *     <td><code>SPACE</code></td></tr>
+     * </table>
+     *
+     * @param      ch   the character to be tested.
+     * @return     <code>true</code> if the character is ISO-LATIN-1 white
+     *             space; <code>false</code> otherwise.
+     * @see        #isSpaceChar
+     * @see        #isWhitespace
+     * @deprecated Replaced by isWhitespace(UChar).
+     */
+    static bool_t isSpace(UChar ch);
+
    /**
     * Determines if the specified character is a Unicode space character
     * according to Unicode 2.1.2.
@ -537,6 +563,33 @@ public:
     */
    static  bool_t              isSpaceChar(UChar     ch);

+    /**
+     * Determines if the specified character is white space according to ICU.
+     * A character is considered to be an ICU whitespace character if and only
+     * if it satisfies one of the following criteria:
+     * <ul>
+     * <li> It is a Unicode space separator (category "Zs"), but is not
+     *      a no-break space (&#92;u00A0 or &#92;uFEFF).
+     * <li> It is a Unicode line separator (category "Zl").
+     * <li> It is a Unicode paragraph separator (category "Zp").
+     * <li> It is &#92;u0009, HORIZONTAL TABULATION.
+     * <li> It is &#92;u000A, LINE FEED.
+     * <li> It is &#92;u000B, VERTICAL TABULATION.
+     * <li> It is &#92;u000C, FORM FEED.
+     * <li> It is &#92;u000D, CARRIAGE RETURN.
+     * <li> It is &#92;u001C, FILE SEPARATOR.
+     * <li> It is &#92;u001D, GROUP SEPARATOR.
+     * <li> It is &#92;u001E, RECORD SEPARATOR.
+     * <li> It is &#92;u001F, UNIT SEPARATOR.
+     * </ul>
+     *
+     * @param   ch  the character to be tested.
+     * @return  true if the character is an ICU whitespace character;
+     *          false otherwise.
+     * @see     #isSpaceChar
+     */
+    static bool_t isWhitespace(UChar ch);
+
   /**
     * Returns a value indicating a character category according to Unicode
     * 2.1.2.