ICU-221 add isWhitespace, isSpace

X-SVN-Rev: 546
This commit is contained in:
Alan Liu 2000-01-12 18:01:51 +00:00
parent fea8b1b618
commit b06425886b
2 changed files with 145 additions and 0 deletions

View file

@ -202,6 +202,40 @@ Unicode::toTitleCase(UChar ch)
return(u_totitle(ch) );
}
/**
 * Determines if the specified character is ISO-LATIN-1 white space.
 * This method returns <code>true</code> for the following five
 * characters only:
 * <table>
 * <tr><td>'\t'</td> <td>&#92;u0009</td>
 * <td><code>HORIZONTAL TABULATION</code></td></tr>
 * <tr><td>'\n'</td> <td>&#92;u000A</td>
 * <td><code>NEW LINE</code></td></tr>
 * <tr><td>'\f'</td> <td>&#92;u000C</td>
 * <td><code>FORM FEED</code></td></tr>
 * <tr><td>'\r'</td> <td>&#92;u000D</td>
 * <td><code>CARRIAGE RETURN</code></td></tr>
 * <tr><td>'&nbsp;&nbsp;'</td> <td>&#92;u0020</td>
 * <td><code>SPACE</code></td></tr>
 * </table>
 *
 * @param ch the character to be tested.
 * @return <code>true</code> if the character is ISO-LATIN-1 white
 * space; <code>false</code> otherwise.
 * @see #isSpaceChar
 * @see #isWhitespace
 * @deprecated Replaced by isWhitespace(UChar).
 */
bool_t
Unicode::isSpace(UChar ch) {
    // U+0020 is tested on its own.  A single bit mask covering the code
    // points 0x00..0x20 would need 33 bits, and shifting a 32-bit integer
    // by 32 positions (either to build the mask or to extract the bit for
    // ch == 0x20) is undefined behavior.
    //
    // The remaining four characters all lie at or below U+000D, so a
    // 32-bit mask with one bit per code point is sufficient for them.
    return (ch == 0x0020) ||
           ((ch <= 0x000D) &&
            ((((int32_t(1) << 0x0009) |
               (int32_t(1) << 0x000A) |
               (int32_t(1) << 0x000C) |
               (int32_t(1) << 0x000D)) >> ch) & int32_t(1)) != 0);
}
// Checks if the Unicode character is a space character.
bool_t
Unicode::isSpaceChar(UChar ch)
@ -209,6 +243,64 @@ Unicode::isSpaceChar(UChar ch)
return(u_isspace(ch) );
}
/**
 * Determines if the specified character is white space according to ICU.
 * A character is considered to be an ICU whitespace character if and only
 * if it satisfies one of the following criteria:
 * <ul>
 * <li> It is a Unicode space separator (category "Zs"), but is not
 * a no-break space (&#92;u00A0 or &#92;uFEFF).
 * <li> It is a Unicode line separator (category "Zl").
 * <li> It is a Unicode paragraph separator (category "Zp").
 * <li> It is &#92;u0009, HORIZONTAL TABULATION.
 * <li> It is &#92;u000A, LINE FEED.
 * <li> It is &#92;u000B, VERTICAL TABULATION.
 * <li> It is &#92;u000C, FORM FEED.
 * <li> It is &#92;u000D, CARRIAGE RETURN.
 * <li> It is &#92;u001C, FILE SEPARATOR.
 * <li> It is &#92;u001D, GROUP SEPARATOR.
 * <li> It is &#92;u001E, RECORD SEPARATOR.
 * <li> It is &#92;u001F, UNIT SEPARATOR.
 * </ul>
 *
 * @param ch the character to be tested.
 * @return true if the character is an ICU whitespace character;
 * false otherwise.
 * @see #isSpaceChar
 */
bool_t
Unicode::isWhitespace(UChar ch) {
    // From Mark Davis:
    //| What we should do is to make sure that the special Cc characters like CR
    //| have either Zs, Zl, or Zp in the property database. We can then just call
    //| the equivalent of:
    //|
    //| public static boolean isWhitespace(char ch) {
    //| return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
    //|
    //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
    //|
    //| This is much faster code, since it just looks up the property value and
    //| does a couple of arithmetic operations to get the right answer.

    // TEMPORARY IMPLEMENTATION until the tables are updated to
    // modify Cc character categories:
    const int8_t cat = Unicode::getType(ch);
    return
        // Zs, excluding the two no-break spaces.
        (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
        (cat == LINE_SEPARATOR) ||
        (cat == PARAGRAPH_SEPARATOR) ||
        // Control characters treated as whitespace.  Plain range tests are
        // used instead of a 32-bit mask: a mask would need bit 31 (for
        // U+001F), i.e. a shift into the sign bit of a signed integer
        // followed by a right shift of a negative value, neither of which
        // is portable.
        (ch >= 0x0009 && ch <= 0x000D) ||   // TAB, LF, VT, FF, CR
        (ch >= 0x001C && ch <= 0x001F);     // FS, GS, RS, US
}
// Gets the Unicode character's character property.
int8_t
Unicode::getType(UChar ch)

View file

@ -528,6 +528,32 @@ public:
*/
static UChar toTitleCase(UChar ch);
/**
* Determines if the specified character is ISO-LATIN-1 white space.
* This method returns <code>true</code> for the following five
* characters only:
* <table>
* <tr><td>'\t'</td> <td>&#92;u0009</td>
* <td><code>HORIZONTAL TABULATION</code></td></tr>
* <tr><td>'\n'</td> <td>&#92;u000A</td>
* <td><code>NEW LINE</code></td></tr>
* <tr><td>'\f'</td> <td>&#92;u000C</td>
* <td><code>FORM FEED</code></td></tr>
* <tr><td>'\r'</td> <td>&#92;u000D</td>
* <td><code>CARRIAGE RETURN</code></td></tr>
* <tr><td>'&nbsp;&nbsp;'</td> <td>&#92;u0020</td>
* <td><code>SPACE</code></td></tr>
* </table>
*
* @param ch the character to be tested.
* @return <code>true</code> if the character is ISO-LATIN-1 white
* space; <code>false</code> otherwise.
* @see #isSpaceChar
* @see #isWhitespace
* @deprecated Replaced by isWhitespace(UChar).
*/
static bool_t isSpace(UChar ch);
/**
* Determines if the specified character is a Unicode space character
* according to Unicode 2.1.2.
@ -537,6 +563,33 @@ public:
*/
static bool_t isSpaceChar(UChar ch);
/**
* Determines if the specified character is white space according to ICU.
* A character is considered to be an ICU whitespace character if and only
* if it satisfies one of the following criteria:
* <ul>
* <li> It is a Unicode space separator (category "Zs"), but is not
* a no-break space (&#92;u00A0 or &#92;uFEFF).
* <li> It is a Unicode line separator (category "Zl").
* <li> It is a Unicode paragraph separator (category "Zp").
* <li> It is &#92;u0009, HORIZONTAL TABULATION.
* <li> It is &#92;u000A, LINE FEED.
* <li> It is &#92;u000B, VERTICAL TABULATION.
* <li> It is &#92;u000C, FORM FEED.
* <li> It is &#92;u000D, CARRIAGE RETURN.
* <li> It is &#92;u001C, FILE SEPARATOR.
* <li> It is &#92;u001D, GROUP SEPARATOR.
* <li> It is &#92;u001E, RECORD SEPARATOR.
* <li> It is &#92;u001F, UNIT SEPARATOR.
* </ul>
*
* @param ch the character to be tested.
* @return true if the character is an ICU whitespace character;
* false otherwise.
* @see #isSpaceChar
*/
static bool_t isWhitespace(UChar ch);
/**
* Returns a value indicating a character category according to Unicode
* 2.1.2.