mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
ICU-221 add isWhitespace, isSpace
X-SVN-Rev: 546
This commit is contained in:
parent
fea8b1b618
commit
b06425886b
2 changed files with 145 additions and 0 deletions
|
@ -202,6 +202,40 @@ Unicode::toTitleCase(UChar ch)
|
|||
return(u_totitle(ch) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the specified character is ISO-LATIN-1 white space.
|
||||
* This method returns <code>true</code> for the following five
|
||||
* characters only:
|
||||
* <table>
|
||||
* <tr><td>'\t'</td> <td>\u0009</td>
|
||||
* <td><code>HORIZONTAL TABULATION</code></td></tr>
|
||||
* <tr><td>'\n'</td> <td>\u000A</td>
|
||||
* <td><code>NEW LINE</code></td></tr>
|
||||
* <tr><td>'\f'</td> <td>\u000C</td>
|
||||
* <td><code>FORM FEED</code></td></tr>
|
||||
* <tr><td>'\r'</td> <td>\u000D</td>
|
||||
* <td><code>CARRIAGE RETURN</code></td></tr>
|
||||
* <tr><td>' '</td> <td>\u0020</td>
|
||||
* <td><code>SPACE</code></td></tr>
|
||||
* </table>
|
||||
*
|
||||
* @param ch the character to be tested.
|
||||
* @return <code>true</code> if the character is ISO-LATIN-1 white
|
||||
* space; <code>false</code> otherwise.
|
||||
* @see #isSpaceChar
|
||||
* @see #isWhitespace
|
||||
* @deprecated Replaced by isWhitespace(UChar).
|
||||
*/
|
||||
// Deprecated ISO-LATIN-1 white space test.
// Returns true only for U+0009, U+000A, U+000C, U+000D and U+0020;
// every other code unit (including U+0000 and U+000B) yields false.
bool_t
Unicode::isSpace(UChar ch) {
    // Compare against the five code points directly.  A bit mask indexed by
    // code point would need bit 0x20 (32), and shifting a 32-bit integer by
    // 32 positions is undefined behavior in C++, so no mask is used here.
    return (ch == 0x0009) ||    // HORIZONTAL TABULATION
           (ch == 0x000A) ||    // NEW LINE
           (ch == 0x000C) ||    // FORM FEED
           (ch == 0x000D) ||    // CARRIAGE RETURN
           (ch == 0x0020);      // SPACE
}
|
||||
|
||||
// Checks if the Unicode character is a space character.
|
||||
bool_t
|
||||
Unicode::isSpaceChar(UChar ch)
|
||||
|
@ -209,6 +243,64 @@ Unicode::isSpaceChar(UChar ch)
|
|||
return(u_isspace(ch) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the specified character is white space according to ICU.
|
||||
* A character is considered to be an ICU whitespace character if and only
|
||||
* if it satisfies one of the following criteria:
|
||||
* <ul>
|
||||
* <li> It is a Unicode space separator (category "Zs"), but is not
|
||||
* a no-break space (\u00A0 or \uFEFF).
|
||||
* <li> It is a Unicode line separator (category "Zl").
|
||||
* <li> It is a Unicode paragraph separator (category "Zp").
|
||||
* <li> It is \u0009, HORIZONTAL TABULATION.
|
||||
* <li> It is \u000A, LINE FEED.
|
||||
* <li> It is \u000B, VERTICAL TABULATION.
|
||||
* <li> It is \u000C, FORM FEED.
|
||||
* <li> It is \u000D, CARRIAGE RETURN.
|
||||
* <li> It is \u001C, FILE SEPARATOR.
|
||||
* <li> It is \u001D, GROUP SEPARATOR.
|
||||
* <li> It is \u001E, RECORD SEPARATOR.
|
||||
* <li> It is \u001F, UNIT SEPARATOR.
|
||||
* </ul>
|
||||
*
|
||||
* @param ch the character to be tested.
|
||||
* @return true if the character is an ICU whitespace character;
|
||||
* false otherwise.
|
||||
* @see #isSpaceChar
|
||||
*/
|
||||
// Tests whether ch is white space by ICU's definition: a Zs character other
// than the no-break spaces U+00A0 / U+FEFF, a Zl or Zp character, or one of
// the control characters U+0009..U+000D and U+001C..U+001F.
bool_t
Unicode::isWhitespace(UChar ch) {
    // From Mark Davis:
    //| What we should do is to make sure that the special Cc characters like CR
    //| have either Zs, Zl, or Zp in the property database. We can then just call
    //| the equivalent of:
    //|
    //|    public static boolean isWhitespace(char ch) {
    //|        return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
    //|
    //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
    //|
    //| This is much faster code, since it just looks up the property value and
    //| does a couple of arithmetic operations to get the right answer.

    // TEMPORARY IMPLEMENTATION until the tables are updated to
    // modify Cc character categories:
    int8_t cat = Unicode::getType(ch);
    return
        // Space separators, except the two no-break spaces.
        (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
        (cat == LINE_SEPARATOR) ||
        (cat == PARAGRAPH_SEPARATOR) ||
        // Control characters that count as white space.  Plain range checks
        // are used instead of a 32-bit mask so that no signed value is
        // shifted into, or out of, the sign bit.
        (ch >= 0x0009 && ch <= 0x000D) ||   // HT, LF, VT, FF, CR
        (ch >= 0x001C && ch <= 0x001F);     // FS, GS, RS, US
}
|
||||
|
||||
// Gets the Unicode character's character property.
|
||||
int8_t
|
||||
Unicode::getType(UChar ch)
|
||||
|
|
|
@ -528,6 +528,32 @@ public:
|
|||
*/
|
||||
static UChar toTitleCase(UChar ch);
|
||||
|
||||
/**
|
||||
* Determines if the specified character is ISO-LATIN-1 white space.
|
||||
* This method returns <code>true</code> for the following five
|
||||
* characters only:
|
||||
* <table>
|
||||
* <tr><td>'\t'</td> <td>\u0009</td>
|
||||
* <td><code>HORIZONTAL TABULATION</code></td></tr>
|
||||
* <tr><td>'\n'</td> <td>\u000A</td>
|
||||
* <td><code>NEW LINE</code></td></tr>
|
||||
* <tr><td>'\f'</td> <td>\u000C</td>
|
||||
* <td><code>FORM FEED</code></td></tr>
|
||||
* <tr><td>'\r'</td> <td>\u000D</td>
|
||||
* <td><code>CARRIAGE RETURN</code></td></tr>
|
||||
* <tr><td>' '</td> <td>\u0020</td>
|
||||
* <td><code>SPACE</code></td></tr>
|
||||
* </table>
|
||||
*
|
||||
* @param ch the character to be tested.
|
||||
* @return <code>true</code> if the character is ISO-LATIN-1 white
|
||||
* space; <code>false</code> otherwise.
|
||||
* @see #isSpaceChar
|
||||
* @see #isWhitespace
|
||||
* @deprecated Replaced by isWhitespace(UChar).
|
||||
*/
|
||||
static bool_t isSpace(UChar ch);
|
||||
|
||||
/**
|
||||
* Determines if the specified character is a Unicode space character
|
||||
* according to Unicode 2.1.2.
|
||||
|
@ -537,6 +563,33 @@ public:
|
|||
*/
|
||||
static bool_t isSpaceChar(UChar ch);
|
||||
|
||||
/**
|
||||
* Determines if the specified character is white space according to ICU.
|
||||
* A character is considered to be an ICU whitespace character if and only
|
||||
* if it satisfies one of the following criteria:
|
||||
* <ul>
|
||||
* <li> It is a Unicode space separator (category "Zs"), but is not
|
||||
* a no-break space (\u00A0 or \uFEFF).
|
||||
* <li> It is a Unicode line separator (category "Zl").
|
||||
* <li> It is a Unicode paragraph separator (category "Zp").
|
||||
* <li> It is \u0009, HORIZONTAL TABULATION.
|
||||
* <li> It is \u000A, LINE FEED.
|
||||
* <li> It is \u000B, VERTICAL TABULATION.
|
||||
* <li> It is \u000C, FORM FEED.
|
||||
* <li> It is \u000D, CARRIAGE RETURN.
|
||||
* <li> It is \u001C, FILE SEPARATOR.
|
||||
* <li> It is \u001D, GROUP SEPARATOR.
|
||||
* <li> It is \u001E, RECORD SEPARATOR.
|
||||
* <li> It is \u001F, UNIT SEPARATOR.
|
||||
* </ul>
|
||||
*
|
||||
* @param ch the character to be tested.
|
||||
* @return true if the character is an ICU whitespace character;
|
||||
* false otherwise.
|
||||
* @see #isSpaceChar
|
||||
*/
|
||||
static bool_t isWhitespace(UChar ch);
|
||||
|
||||
/**
|
||||
* Returns a value indicating a character category according to Unicode
|
||||
* 2.1.2.
|
||||
|
|
Loading…
Add table
Reference in a new issue