mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
ICU-21652 add emoji properties of strings
- 7 new properties: API constants & property names - u_stringHasBinaryProperty(s, property) & UCharacter.hasBinaryProperty(s, property) - two additional source data files - new genprops part for writing new binary data file uemoji.icu - data for existing emoji properties moved from uprops.icu (hardcoded in C++) to uemoji.icu (always loaded) - new EmojiProps implementation
This commit is contained in:
parent
6244d57559
commit
f9beb616a8
49 changed files with 8116 additions and 3293 deletions
|
@ -829,9 +829,9 @@ loadable data objects.)
|
|||
|
||||
#### Unicode Character Data (Normalization since ICU 4.4) & custom normalization data
|
||||
* Source format:
|
||||
[source/data/unidata/norm2/*.tx](https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/norm2):
|
||||
[source/data/unidata/norm2/*.txt](https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/norm2):
|
||||
Files derived from the [Unicode Character
|
||||
Database](http://www.unicode.org/onlinedat/online.html), or custom data.
|
||||
Database](https://www.unicode.org/onlinedat/online.html), or custom data.
|
||||
* Binary format: .nrm:
|
||||
[source/common/normalizer2impl.h](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/normalizer2impl.h)
|
||||
* Generator tool:
|
||||
|
@ -863,6 +863,18 @@ loadable data objects.)
|
|||
* Generator tool:
|
||||
[genprops](https://github.com/unicode-org/icu/blob/main/tools/unicode/c/genprops)
|
||||
|
||||
#### Unicode Character Data (Emoji properties since ICU 70)
|
||||
Emoji properties of code points moved out of uprops.icu.
|
||||
Emoji properties of strings added.
|
||||
* Source format:
|
||||
[source/data/unidata/emoji-sequences.txt](https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/emoji-sequences.txt) and
|
||||
[source/data/unidata/emoji-zwj-sequences.txt](https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/emoji-zwj-sequences.txt):
|
||||
[UTS #51 Data Files](https://www.unicode.org/reports/tr51/#Data_Files)
|
||||
* Binary format: uemoji.icu:
|
||||
[tools/unicode/c/genprops/emojipropsbuilder.cpp](https://github.com/unicode-org/icu/blob/main/tools/unicode/c/genprops/emojipropsbuilder.cpp)
|
||||
* Generator tool:
|
||||
[genprops](https://github.com/unicode-org/icu/blob/main/tools/unicode/c/genprops)
|
||||
|
||||
#### Collation data (root collation & tailorings; ICU 53 & later)
|
||||
* Source format: Original data from allkeys_CLDR.txt in
|
||||
[CLDR Root Collation Data Files](http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Data_Files)
|
||||
|
|
|
@ -101,10 +101,12 @@ sets of values (for use with value aliases and UnicodeSet).
|
|||
| Age | Unicode version | (U) | C: u_charAge fills in UVersionInfo<br>Java: getAge returns a VersionInfo reference |
|
||||
| Alphabetic | binary | (U) | u_isUAlphabetic, UCHAR_ALPHABETIC |
|
||||
| ASCII_Hex_Digit | binary | (U) | UCHAR_ASCII_HEX_DIGIT |
|
||||
| Basic_Emoji* | binary | (U) | UCHAR_BASIC_EMOJI |
|
||||
| Bidi_Class | enum | (U) | u_charDirection, UCHAR_BIDI_CLASS<br>returns enum UCharDirection |
|
||||
| Bidi_Control | binary | (U) | UCHAR_BIDI_CONTROL |
|
||||
| Bidi_Mirrored | binary | (U) | u_isMirrored, UCHAR_BIDI_MIRRORED |
|
||||
| Bidi_Mirroring_Glyph | code point | | u_charMirror |
|
||||
| Bidi_Paired_Bracket_Type | enum | (U) | UCHAR_BIDI_PAIRED_BRACKET_TYPE<br>returns enum UBidiPairedBracketType |
|
||||
| Block | enum | (U) | ublock_getCode, UCHAR_BLOCK<br>returns enum UBlockCode |
|
||||
| Canonical_Combining_Class | 0..255 | (U) | u_getCombiningClass, UCHAR_CANONICAL_COMBINING_CLASS |
|
||||
| Case_Folding | Unicode string | | u_strFoldCase (ustring.h) |
|
||||
|
@ -124,7 +126,14 @@ sets of values (for use with value aliases and UnicodeSet).
|
|||
| Deprecated | binary | (U) | UCHAR_DEPRECATED |
|
||||
| Diacritic | binary | (U) | UCHAR_DIACRITIC |
|
||||
| East_Asian_Width | enum | (U) | UCHAR_EAST_ASIAN_WIDTH<br>returns enum UEastAsianWidth |
|
||||
| Emoji | binary | (U) | UCHAR_EMOJI |
|
||||
| Emoji_Component | binary | (U) | UCHAR_EMOJI_COMPONENT |
|
||||
| Emoji_Keycap_Sequence* | binary | (U) | UCHAR_EMOJI_KEYCAP_SEQUENCE |
|
||||
| Emoji_Modifier | binary | (U) | UCHAR_EMOJI_MODIFIER |
|
||||
| Emoji_Modifier_Base | binary | (U) | UCHAR_EMOJI_MODIFIER_BASE |
|
||||
| Emoji_Presentation | binary | (U) | UCHAR_EMOJI_PRESENTATION |
|
||||
| Expands_On_NF* | binary | | available via normalization API (normalizer2.h) |
|
||||
| Extended_Pictographic | binary | (U) | UCHAR_EXTENDED_PICTOGRAPHIC |
|
||||
| Extender | binary | (U) | UCHAR_EXTENDER |
|
||||
| FC_NFKC_Closure | Unicode string | | u_getFC_NFKC_Closure |
|
||||
| Full_Composition_Exclusion | binary | (U) | UCHAR_FULL_COMPOSITION_EXCLUSION |
|
||||
|
@ -168,8 +177,15 @@ sets of values (for use with value aliases and UnicodeSet).
|
|||
| Other_Uppercase | binary | (c) | contributes to Uppercase |
|
||||
| Pattern_Syntax | binary | (U) | UCHAR_PATTERN_SYNTAX |
|
||||
| Pattern_White_Space | binary | (U) | UCHAR_PATTERN_WHITE_SPACE |
|
||||
| Prepended_Concatenation_Mark | binary | (U) | UCHAR_PREPENDED_CONCATENATION_MARK |
|
||||
| Quotation_Mark | binary | (U) | UCHAR_QUOTATION_MARK |
|
||||
| Radical | binary | (U) | UCHAR_RADICAL |
|
||||
| Regional_Indicator | binary | (U) | UCHAR_REGIONAL_INDICATOR |
|
||||
| RGI_Emoji* | binary | (U) | UCHAR_RGI_EMOJI |
|
||||
| RGI_Emoji_Flag_Sequence* | binary | (U) | UCHAR_RGI_EMOJI_FLAG_SEQUENCE |
|
||||
| RGI_Emoji_Modifier_Sequence* | binary | (U) | UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE |
|
||||
| RGI_Emoji_Tag_Sequence* | binary | (U) | UCHAR_RGI_EMOJI_TAG_SEQUENCE |
|
||||
| RGI_Emoji_ZWJ_Sequence* | binary | (U) | UCHAR_RGI_EMOJI_ZWJ_SEQUENCE |
|
||||
| Script | enum | (U) | uscript_getCode (uscript.h), UCHAR_SCRIPT<br>returns enum UScriptCode |
|
||||
| Script_Extensions | list | (U) | uscript_getScriptExtensions & uscript_hasScript (uscript.h), UCHAR_SCRIPT_EXTENSIONS<br>returns a list of enum UScriptCode values |
|
||||
| Sentence_Break | enum | (U) | UCHAR_SENTENCE_BREAK<br>returns enum USentenceBreak |
|
||||
|
@ -202,10 +218,15 @@ Notes:
|
|||
Properties which are not available in UnicodeSet are generally those that
|
||||
are not available through a UProperty selector.
|
||||
|
||||
3. UnicodeSet `[:scx=Arab:]` is a superset of `[:sc=Arab:]`;
|
||||
3. When a property name is followed by a star (*), it is a property of strings;
|
||||
for example, Basic_Emoji and RGI_Emoji.
|
||||
See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
Properties of strings are not yet supported in ICU regular expressions.
|
||||
|
||||
4. UnicodeSet `[:scx=Arab:]` is a superset of `[:sc=Arab:]`;
|
||||
see https://www.unicode.org/reports/tr18/#Script_Property
|
||||
|
||||
4. Full case mapping properties (e.g., Lowercase_Mapping) are complex.
|
||||
5. Full case mapping properties (e.g., Lowercase_Mapping) are complex.
|
||||
The string case mapping functions that implement them handle language-specific
|
||||
and/or context-sensitive mappings.
|
||||
The output may have more code points or fewer code points than the input.
|
||||
|
|
|
@ -429,6 +429,7 @@ cc_library(
|
|||
includes = ["."],
|
||||
deps = [
|
||||
":headers",
|
||||
":emojiprops",
|
||||
":ucptrie",
|
||||
":umutablecptrie",
|
||||
":uniset_core",
|
||||
|
@ -735,6 +736,25 @@ cc_library(
|
|||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "emojiprops",
|
||||
srcs = [
|
||||
"emojiprops.cpp",
|
||||
"emojiprops.h",
|
||||
],
|
||||
includes = ["."],
|
||||
deps = [
|
||||
":headers",
|
||||
":ucharstrie",
|
||||
":ucharstrieiterator",
|
||||
":ucptrie",
|
||||
":udata",
|
||||
],
|
||||
local_defines = [
|
||||
"U_COMMON_IMPLEMENTATION",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "ucharstrie",
|
||||
srcs = [
|
||||
|
@ -997,6 +1017,7 @@ cc_library(
|
|||
includes = ["."],
|
||||
deps = [
|
||||
":headers",
|
||||
":emojiprops",
|
||||
":loadednormalizer2",
|
||||
":normalizer2",
|
||||
":ubidi_props",
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "cmemory.h"
|
||||
#include "emojiprops.h"
|
||||
#include "mutex.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
|
@ -170,6 +171,13 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
|
|||
case UPROPS_SRC_VO:
|
||||
uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
|
||||
break;
|
||||
case UPROPS_SRC_EMOJI: {
|
||||
const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
ep->addPropertyStarts(&sa, errorCode);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
errorCode = U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
|
@ -268,6 +276,26 @@ UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
|
|||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
|
||||
// property of strings
|
||||
const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
|
||||
if (U_FAILURE(errorCode)) { return nullptr; }
|
||||
USetAdder sa = {
|
||||
(USet *)set.getAlias(),
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString,
|
||||
nullptr, // don't need remove()
|
||||
nullptr // don't need removeRange()
|
||||
};
|
||||
ep->addStrings(&sa, property, errorCode);
|
||||
if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
|
||||
// property of _only_ strings
|
||||
set->freeze();
|
||||
return set.orphan();
|
||||
}
|
||||
}
|
||||
|
||||
const UnicodeSet *inclusions =
|
||||
icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return nullptr; }
|
||||
|
|
|
@ -204,6 +204,7 @@
|
|||
<ClCompile Include="ucase.cpp" />
|
||||
<ClCompile Include="uchar.cpp" />
|
||||
<ClCompile Include="characterproperties.cpp" />
|
||||
<ClCompile Include="emojiprops.cpp" />
|
||||
<ClCompile Include="unames.cpp" />
|
||||
<ClCompile Include="unifiedcache.cpp" />
|
||||
<ClCompile Include="unifilt.cpp" />
|
||||
|
@ -365,6 +366,7 @@
|
|||
<ClInclude Include="patternprops.h" />
|
||||
<ClInclude Include="propname.h" />
|
||||
<ClInclude Include="ruleiter.h" />
|
||||
<ClInclude Include="emojiprops.h" />
|
||||
<ClInclude Include="ucase.h" />
|
||||
<ClInclude Include="ulayout_props.h" />
|
||||
<ClInclude Include="unisetspan.h" />
|
||||
|
|
|
@ -412,6 +412,9 @@
|
|||
<ClCompile Include="characterproperties.cpp">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="emojiprops.cpp">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="propname.cpp">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClCompile>
|
||||
|
@ -894,6 +897,9 @@
|
|||
<ClInclude Include="ruleiter.h">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="emojiprops.h">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="ucase.h">
|
||||
<Filter>properties & sets</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -338,6 +338,7 @@
|
|||
<ClCompile Include="ucase.cpp" />
|
||||
<ClCompile Include="uchar.cpp" />
|
||||
<ClCompile Include="characterproperties.cpp" />
|
||||
<ClCompile Include="emojiprops.cpp" />
|
||||
<ClCompile Include="unames.cpp" />
|
||||
<ClCompile Include="unifiedcache.cpp" />
|
||||
<ClCompile Include="unifilt.cpp" />
|
||||
|
@ -500,6 +501,7 @@
|
|||
<ClInclude Include="patternprops.h" />
|
||||
<ClInclude Include="propname.h" />
|
||||
<ClInclude Include="ruleiter.h" />
|
||||
<ClInclude Include="emojiprops.h" />
|
||||
<ClInclude Include="ucase.h" />
|
||||
<ClInclude Include="ulayout_props.h" />
|
||||
<ClInclude Include="unisetspan.h" />
|
||||
|
|
220
icu4c/source/common/emojiprops.cpp
Normal file
220
icu4c/source/common/emojiprops.cpp
Normal file
|
@ -0,0 +1,220 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: https://www.unicode.org/copyright.html
|
||||
|
||||
// emojiprops.cpp
|
||||
// created: 2021sep04 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/ustringtrie.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "emojiprops.h"
|
||||
#include "ucln.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "umutex.h"
|
||||
#include "uset_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
EmojiProps *singleton = nullptr;
|
||||
icu::UInitOnce emojiInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
/**
 * Cleanup callback registered by initSingleton().
 * Deletes the EmojiProps singleton, clears the pointer, and resets the
 * init-once flag.
 *
 * @return always true
 */
UBool U_CALLCONV emojiprops_cleanup() {
    delete singleton;
    singleton = nullptr;
    emojiInitOnce.reset();
    return true;
}
|
||||
|
||||
/**
 * Init-once callback: creates the EmojiProps singleton.
 * Does nothing if errorCode already indicates a failure on entry.
 *
 * @param errorCode in/out error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *                  allocation fails, or to whatever the EmojiProps constructor
 *                  reported. On any failure the singleton stays nullptr.
 */
void U_CALLCONV initSingleton(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    EmojiProps *props = new EmojiProps(errorCode);
    if (props == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    } else if (U_FAILURE(errorCode)) {
        // The constructor failed: do not keep a half-initialized object.
        delete props;
        props = nullptr;
    }
    singleton = props;
    // Registered even on failure, matching the unconditional registration here.
    ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup);
}
|
||||
|
||||
// TODO: turn this into a shared helper function
|
||||
// Requires the major version to match, and then requires at least the minor version.
|
||||
UBool udata_isAcceptableMajorMinor(
|
||||
const UDataInfo &info, const UChar *dataFormat, uint8_t major, uint8_t minor) {
|
||||
return
|
||||
info.size >= 20 &&
|
||||
info.isBigEndian == U_IS_BIG_ENDIAN &&
|
||||
info.charsetFamily == U_CHARSET_FAMILY &&
|
||||
info.dataFormat[0] == dataFormat[0] &&
|
||||
info.dataFormat[1] == dataFormat[1] &&
|
||||
info.dataFormat[2] == dataFormat[2] &&
|
||||
info.dataFormat[3] == dataFormat[3] &&
|
||||
info.formatVersion[0] == major &&
|
||||
info.formatVersion[1] >= minor;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
 * Releases the resources acquired by load(): the opened data item and the
 * code point trie object.
 */
EmojiProps::~EmojiProps() {
    udata_close(memory);
    ucptrie_close(cpTrie);
}
|
||||
|
||||
/**
 * Returns the shared EmojiProps instance, creating it on first use via
 * umtx_initOnce().
 *
 * @param errorCode in/out error code; nothing is done if it is already a failure
 * @return the singleton, or nullptr if errorCode was a failure on entry or the
 *         singleton could not be created
 */
const EmojiProps *
EmojiProps::getSingleton(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(emojiInitOnce, &initSingleton, errorCode);
        return singleton;
    }
    return nullptr;
}
|
||||
|
||||
/**
 * Acceptance callback passed to udata_openChoice(): accepts data with
 * dataFormat "Emoj", format major version 1, and minor version >= 0.
 * The context, type and name arguments are ignored.
 */
UBool U_CALLCONV
EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/,
                         const UDataInfo *pInfo) {
    const UDataInfo &info = *pInfo;
    return udata_isAcceptableMajorMinor(info, u"Emoj", 1, 0);
}
|
||||
|
||||
void
|
||||
EmojiProps::load(UErrorCode &errorCode) {
|
||||
memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
const uint8_t *inBytes = (const uint8_t *)udata_getMemory(memory);
|
||||
const int32_t *inIndexes = (const int32_t *)inBytes;
|
||||
int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4;
|
||||
if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t i = IX_CPTRIE_OFFSET;
|
||||
int32_t offset = inIndexes[i++];
|
||||
int32_t nextOffset = inIndexes[i];
|
||||
cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8,
|
||||
inBytes + offset, nextOffset - offset, nullptr, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) {
|
||||
offset = inIndexes[i];
|
||||
nextOffset = inIndexes[i + 1];
|
||||
// Set/leave nullptr if there is no UCharsTrie.
|
||||
const UChar *p = nextOffset > offset ? (const UChar *)(inBytes + offset) : nullptr;
|
||||
stringTries[getStringTrieIndex(i)] = p;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
|
||||
// Add the start code point of each same-value range of the trie.
|
||||
UChar32 start = 0, end;
|
||||
uint32_t value;
|
||||
while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0,
|
||||
nullptr, nullptr, &value)) >= 0) {
|
||||
sa->add(sa->set, start);
|
||||
start = end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Static convenience: tests a code point against an emoji binary property
 * using the singleton.
 *
 * @return false if the singleton cannot be obtained; otherwise the result of
 *         hasBinaryPropertyImpl(c, which)
 */
UBool
EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *ep = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return ep->hasBinaryPropertyImpl(c, which);
}
|
||||
|
||||
/**
 * Tests a single code point against one of the properties
 * UCHAR_EMOJI..UCHAR_RGI_EMOJI by looking up its flag bit in the code point trie.
 *
 * @param c     code point to test
 * @param which property selector
 * @return the value of the property's bit for c; false if which is outside
 *         UCHAR_EMOJI..UCHAR_RGI_EMOJI or has no bit in the table below
 */
UBool
EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const {
    // Bit number in the trie value for each property, indexed by (which - UCHAR_EMOJI);
    // -1 marks a property that is not handled by this function.
    // Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
    static constexpr int8_t bitFlags[] = {
        BIT_EMOJI,                  // UCHAR_EMOJI=57
        BIT_EMOJI_PRESENTATION,     // UCHAR_EMOJI_PRESENTATION=58
        BIT_EMOJI_MODIFIER,         // UCHAR_EMOJI_MODIFIER=59
        BIT_EMOJI_MODIFIER_BASE,    // UCHAR_EMOJI_MODIFIER_BASE=60
        BIT_EMOJI_COMPONENT,        // UCHAR_EMOJI_COMPONENT=61
        -1,                         // UCHAR_REGIONAL_INDICATOR=62
        -1,                         // UCHAR_PREPENDED_CONCATENATION_MARK=63
        BIT_EXTENDED_PICTOGRAPHIC,  // UCHAR_EXTENDED_PICTOGRAPHIC=64
        BIT_BASIC_EMOJI,            // UCHAR_BASIC_EMOJI=65
        -1,                         // UCHAR_EMOJI_KEYCAP_SEQUENCE=66
        -1,                         // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67
        -1,                         // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68
        -1,                         // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69
        -1,                         // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70
        BIT_BASIC_EMOJI,            // UCHAR_RGI_EMOJI=71
    };
    if (!(UCHAR_EMOJI <= which && which <= UCHAR_RGI_EMOJI)) {
        return false;
    }
    const int32_t bit = bitFlags[which - UCHAR_EMOJI];
    if (bit >= 0) {
        const uint8_t bits = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c);
        return (bits >> bit) & 1;
    }
    return false;  // not a property that we support in this function
}
|
||||
|
||||
/**
 * Static convenience: tests a string against an emoji property of strings
 * using the singleton.
 *
 * @return false if the singleton cannot be obtained; otherwise the result of
 *         hasBinaryPropertyImpl(s, length, which)
 */
UBool
EmojiProps::hasBinaryProperty(const UChar *s, int32_t length, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *ep = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return ep->hasBinaryPropertyImpl(s, length, which);
}
|
||||
|
||||
/**
 * Tests whether the string is contained in the string trie(s) of a property
 * in UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI.
 *
 * @param s      string to test; may be nullptr only if length is 0
 * @param length number of code units; when negative, *s == 0 is treated as
 *               the empty string
 * @param which  property selector
 * @return true if a relevant trie has a value for the whole string; false for
 *         an empty string, a nullptr with non-zero length, an out-of-range
 *         property, or no match
 */
UBool
EmojiProps::hasBinaryPropertyImpl(const UChar *s, int32_t length, UProperty which) const {
    if (s == nullptr && length != 0) { return false; }
    if (length == 0 || (length < 0 && *s == 0)) { return false; }  // empty string
    // The caller should have delegated single code points to hasBinaryProperty(c, which).
    if (!(UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI)) {
        return false;
    }
    // Range of stringTries[] slots to search; normally just the one for 'which'.
    int32_t firstTrie = which - UCHAR_BASIC_EMOJI;
    int32_t lastTrie = firstTrie;
    if (which == UCHAR_RGI_EMOJI) {
        // RGI_Emoji is the union of the other emoji properties of strings.
        firstTrie = 0;
        lastTrie = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE - UCHAR_BASIC_EMOJI;
    }
    for (int32_t t = firstTrie; t <= lastTrie; ++t) {
        const UChar *trieUChars = stringTries[t];
        if (trieUChars == nullptr) {
            continue;  // no trie data for this property
        }
        UCharsTrie trie(trieUChars);
        if (USTRINGTRIE_HAS_VALUE(trie.next(s, length))) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
void
|
||||
EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {
|
||||
return;
|
||||
}
|
||||
UProperty firstProp = which, lastProp = which;
|
||||
if (which == UCHAR_RGI_EMOJI) {
|
||||
// RGI_Emoji is the union of the other emoji properties of strings.
|
||||
firstProp = UCHAR_BASIC_EMOJI;
|
||||
lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;
|
||||
}
|
||||
for (int32_t prop = firstProp; prop <= lastProp; ++prop) {
|
||||
const UChar *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];
|
||||
if (trieUChars != nullptr) {
|
||||
UCharsTrie::Iterator iter(trieUChars, 0, errorCode);
|
||||
while (iter.next(errorCode)) {
|
||||
const UnicodeString &s = iter.getString();
|
||||
sa->addString(sa->set, s.getBuffer(), s.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
90
icu4c/source/common/emojiprops.h
Normal file
90
icu4c/source/common/emojiprops.h
Normal file
|
@ -0,0 +1,90 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: https://www.unicode.org/copyright.html
|
||||
|
||||
// emojiprops.h
|
||||
// created: 2021sep03 Markus W. Scherer
|
||||
|
||||
#ifndef __EMOJIPROPS_H__
|
||||
#define __EMOJIPROPS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uset_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class EmojiProps : public UMemory {
|
||||
public:
|
||||
// @internal
|
||||
EmojiProps(UErrorCode &errorCode) { load(errorCode); }
|
||||
~EmojiProps();
|
||||
|
||||
static const EmojiProps *getSingleton(UErrorCode &errorCode);
|
||||
static UBool hasBinaryProperty(UChar32 c, UProperty which);
|
||||
static UBool hasBinaryProperty(const UChar *s, int32_t length, UProperty which);
|
||||
|
||||
void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
|
||||
void addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const;
|
||||
|
||||
enum {
|
||||
// Byte offsets from the start of the data, after the generic header,
|
||||
// in ascending order.
|
||||
// UCPTrie=CodePointTrie, follows the indexes
|
||||
IX_CPTRIE_OFFSET,
|
||||
IX_RESERVED1,
|
||||
IX_RESERVED2,
|
||||
IX_RESERVED3,
|
||||
|
||||
// UCharsTrie=CharsTrie
|
||||
IX_BASIC_EMOJI_TRIE_OFFSET,
|
||||
IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET,
|
||||
IX_RESERVED10,
|
||||
IX_RESERVED11,
|
||||
IX_RESERVED12,
|
||||
IX_TOTAL_SIZE,
|
||||
|
||||
// Not initially byte offsets.
|
||||
IX_RESERVED14,
|
||||
IX_RESERVED15,
|
||||
IX_COUNT // 16
|
||||
};
|
||||
|
||||
// Properties in the code point trie.
|
||||
enum {
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
BIT_EMOJI,
|
||||
BIT_EMOJI_PRESENTATION,
|
||||
BIT_EMOJI_MODIFIER,
|
||||
BIT_EMOJI_MODIFIER_BASE,
|
||||
BIT_EMOJI_COMPONENT,
|
||||
BIT_EXTENDED_PICTOGRAPHIC,
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
BIT_BASIC_EMOJI
|
||||
};
|
||||
|
||||
private:
|
||||
static UBool U_CALLCONV
|
||||
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
|
||||
/** Input i: One of the IX_..._TRIE_OFFSET indexes into the data file indexes[] array. */
|
||||
static int32_t getStringTrieIndex(int32_t i) {
|
||||
return i - IX_BASIC_EMOJI_TRIE_OFFSET;
|
||||
}
|
||||
|
||||
void load(UErrorCode &errorCode);
|
||||
UBool hasBinaryPropertyImpl(UChar32 c, UProperty which) const;
|
||||
UBool hasBinaryPropertyImpl(const UChar *s, int32_t length, UProperty which) const;
|
||||
|
||||
UDataMemory *memory = nullptr;
|
||||
UCPTrie *cpTrie = nullptr;
|
||||
const UChar *stringTries[6] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __EMOJIPROPS_H__
|
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,7 @@ dictbe.cpp
|
|||
dictionarydata.cpp
|
||||
dtintrv.cpp
|
||||
edits.cpp
|
||||
emojiprops.cpp
|
||||
errorcode.cpp
|
||||
filteredbrk.cpp
|
||||
filterednormalizer2.cpp
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -51,6 +51,7 @@ typedef enum ECleanupCommonType {
|
|||
UCLN_COMMON_USET,
|
||||
UCLN_COMMON_UNAMES,
|
||||
UCLN_COMMON_UPROPS,
|
||||
UCLN_COMMON_EMOJIPROPS,
|
||||
UCLN_COMMON_UCNV,
|
||||
UCLN_COMMON_UCNV_IO,
|
||||
UCLN_COMMON_UDATA,
|
||||
|
|
|
@ -483,12 +483,63 @@ typedef enum UProperty {
|
|||
* @stable ICU 62
|
||||
*/
|
||||
UCHAR_EXTENDED_PICTOGRAPHIC=64,
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Binary property of strings Basic_Emoji.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_BASIC_EMOJI=65,
|
||||
/**
|
||||
* Binary property of strings Emoji_Keycap_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_EMOJI_KEYCAP_SEQUENCE=66,
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Modifier_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67,
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Flag_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68,
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Tag_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_RGI_EMOJI_TAG_SEQUENCE=69,
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_ZWJ_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70,
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
UCHAR_RGI_EMOJI=71,
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
/**
|
||||
* One more than the last constant for binary Unicode properties.
|
||||
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
|
||||
*/
|
||||
UCHAR_BINARY_LIMIT,
|
||||
UCHAR_BINARY_LIMIT=72,
|
||||
#endif // U_HIDE_DEPRECATED_API
|
||||
|
||||
/** Enumerated property Bidi_Class.
|
||||
|
@ -2615,10 +2666,10 @@ typedef enum UVerticalOrientation {
|
|||
*
|
||||
* @param c Code point to test.
|
||||
* @param which UProperty selector constant, identifies which binary property to check.
|
||||
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT.
|
||||
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT.
|
||||
* @return true or false according to the binary Unicode property value for c.
|
||||
* Also false if 'which' is out of bounds or if the Unicode version
|
||||
* does not have data for the property at all, or not for this code point.
|
||||
* does not have data for the property at all.
|
||||
*
|
||||
* @see UProperty
|
||||
* @see u_getBinaryPropertySet
|
||||
|
@ -2629,6 +2680,37 @@ typedef enum UVerticalOrientation {
|
|||
U_CAPI UBool U_EXPORT2
|
||||
u_hasBinaryProperty(UChar32 c, UProperty which);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns true if the property is true for the string.
|
||||
* Same as u_hasBinaryProperty(single code point, which)
|
||||
* if the string contains exactly one code point.
|
||||
*
|
||||
* Most properties apply only to single code points.
|
||||
* <a href="https://www.unicode.org/reports/tr51/#Emoji_Sets">UTS #51 Unicode Emoji</a>
|
||||
* defines several properties of strings.
|
||||
*
|
||||
* @param s String to test.
|
||||
* @param length Length of the string, or negative if NUL-terminated.
|
||||
* @param which UProperty selector constant, identifies which binary property to check.
|
||||
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT.
|
||||
* @return true or false according to the binary Unicode property value for the string.
|
||||
* Also false if 'which' is out of bounds or if the Unicode version
|
||||
* does not have data for the property at all.
|
||||
*
|
||||
* @see UProperty
|
||||
* @see u_hasBinaryProperty
|
||||
* @see u_getBinaryPropertySet
|
||||
* @see u_getIntPropertyValue
|
||||
* @see u_getUnicodeVersion
|
||||
* @draft ICU 70
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which);
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns a frozen USet for a binary property.
|
||||
* The library retains ownership over the returned object.
|
||||
|
|
|
@ -30,7 +30,9 @@
|
|||
#include "unicode/unorm2.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cstring.h"
|
||||
#include "emojiprops.h"
|
||||
#include "mutex.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "umutex.h"
|
||||
|
@ -322,6 +324,10 @@ static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UPro
|
|||
return 0x1F1E6<=c && c<=0x1F1FF;
|
||||
}
|
||||
|
||||
/**
 * BinaryProperty contains-callback for the binProps[] entries that use
 * UPROPS_SRC_EMOJI: forwards to EmojiProps. The table entry itself is unused.
 */
static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
    const UBool result = EmojiProps::hasBinaryProperty(c, which);
    return result;
}
|
||||
|
||||
static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
|
||||
/*
|
||||
* column and mask values for binary properties from u_getUnicodeProperties().
|
||||
|
@ -388,14 +394,21 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
|
|||
{ UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded },
|
||||
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED
|
||||
{ UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded },
|
||||
{ 2, U_MASK(UPROPS_2_EMOJI), defaultContains },
|
||||
{ 2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains },
|
||||
{ 2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains },
|
||||
{ 2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains },
|
||||
{ 2, U_MASK(UPROPS_2_EMOJI_COMPONENT), defaultContains },
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_PRESENTATION
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER_BASE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_COMPONENT
|
||||
{ 2, 0, isRegionalIndicator },
|
||||
{ 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains },
|
||||
{ 2, U_MASK(UPROPS_2_EXTENDED_PICTOGRAPHIC), defaultContains },
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EXTENDED_PICTOGRAPHIC
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_BASIC_EMOJI
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_KEYCAP_SEQUENCE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_TAG_SEQUENCE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE
|
||||
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI
|
||||
};
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
|
@ -410,6 +423,26 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Tests a string against a binary property.
 * A string consisting of exactly one code point is delegated to
 * u_hasBinaryProperty(); anything else is checked against the emoji
 * properties of strings.
 *
 * @param s      string to test; nullptr is accepted only together with length 0
 * @param length number of code units, or negative if s is NUL-terminated
 * @param which  property selector
 * @return the property value for the string; false for a nullptr with
 *         non-zero length, and for multi-code point or empty strings when
 *         which is not in UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI
 */
U_CAPI UBool U_EXPORT2
u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which) {
    if (s == nullptr && length != 0) { return false; }
    if (length == 1) {
        return u_hasBinaryProperty(s[0], which);  // single code point
    }
    if (length == 2 || (length < 0 && *s != 0)) {  // not empty string
        // Decode the first code point and see whether it is also the last one.
        int32_t i = 0;
        UChar32 c;
        U16_NEXT(s, i, length, c);
        const bool atEnd = length > 0 ? i == length : s[i] == 0;
        if (atEnd) {
            return u_hasBinaryProperty(c, which);  // single code point
        }
    }
    // Only call into EmojiProps for a relevant property,
    // so that we do not unnecessarily try to load its data file.
    if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {
        return false;
    }
    return EmojiProps::hasBinaryProperty(s, length, which);
}
|
||||
|
||||
struct IntProperty;
|
||||
|
||||
typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which);
|
||||
|
|
|
@ -224,7 +224,8 @@ enum {
|
|||
/*
|
||||
* Properties in vector word 2
|
||||
* Bits
|
||||
* 31..26 http://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
* 31..26 unused since ICU 70 added uemoji.icu;
|
||||
* in ICU 57..69 stored emoji properties
|
||||
* 25..20 Line Break
|
||||
* 19..15 Sentence Break
|
||||
* 14..10 Word Break
|
||||
|
@ -232,12 +233,12 @@ enum {
|
|||
* 4.. 0 Decomposition Type
|
||||
*/
|
||||
enum {
|
||||
UPROPS_2_EXTENDED_PICTOGRAPHIC=26,
|
||||
UPROPS_2_EMOJI_COMPONENT,
|
||||
UPROPS_2_EMOJI,
|
||||
UPROPS_2_EMOJI_PRESENTATION,
|
||||
UPROPS_2_EMOJI_MODIFIER,
|
||||
UPROPS_2_EMOJI_MODIFIER_BASE
|
||||
UPROPS_2_UNUSED_WAS_EXTENDED_PICTOGRAPHIC=26, // ICU 62..69
|
||||
UPROPS_2_UNUSED_WAS_EMOJI_COMPONENT, // ICU 60..69
|
||||
UPROPS_2_UNUSED_WAS_EMOJI, // ICU 57..69
|
||||
UPROPS_2_UNUSED_WAS_EMOJI_PRESENTATION, // ICU 57..69
|
||||
UPROPS_2_UNUSED_WAS_EMOJI_MODIFIER, // ICU 57..69
|
||||
UPROPS_2_UNUSED_WAS_EMOJI_MODIFIER_BASE // ICU 57..69
|
||||
};
|
||||
|
||||
#define UPROPS_LB_MASK 0x03f00000
|
||||
|
@ -377,6 +378,7 @@ enum UPropertySource {
|
|||
UPROPS_SRC_INPC,
|
||||
UPROPS_SRC_INSC,
|
||||
UPROPS_SRC_VO,
|
||||
UPROPS_SRC_EMOJI,
|
||||
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
|
||||
UPROPS_SRC_COUNT
|
||||
};
|
||||
|
|
|
@ -22,6 +22,7 @@ def generate(config, io, common_vars):
|
|||
|
||||
requests += generate_cnvalias(config, io, common_vars)
|
||||
requests += generate_ulayout(config, io, common_vars)
|
||||
requests += generate_uemoji(config, io, common_vars)
|
||||
requests += generate_confusables(config, io, common_vars)
|
||||
requests += generate_conversion_mappings(config, io, common_vars)
|
||||
requests += generate_brkitr_brk(config, io, common_vars)
|
||||
|
@ -181,7 +182,9 @@ def generate_brkitr_brk(config, io, common_vars):
|
|||
RepeatedExecutionRequest(
|
||||
name = "brkitr_brk",
|
||||
category = "brkitr_rules",
|
||||
dep_targets = [DepTarget("cnvalias"), DepTarget("ulayout"), DepTarget("lstm_res")],
|
||||
dep_targets =
|
||||
[DepTarget("cnvalias"),
|
||||
DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
|
||||
input_files = input_files,
|
||||
output_files = output_files,
|
||||
tool = IcuTool("genbrk"),
|
||||
|
@ -354,6 +357,25 @@ def generate_ulayout(config, io, common_vars):
|
|||
]
|
||||
|
||||
|
||||
def generate_uemoji(config, io, common_vars):
    """Return the build requests for the Unicode emoji properties data.

    A single request named "uemoji" runs the ``icupkg`` tool on the
    checked-in ``in/uemoji.icu`` file and writes ``uemoji.icu`` to the
    output directory. The request has no dependencies on other targets.

    The parameters are not used here; the signature matches the other
    ``generate_*`` functions called from ``generate()``.
    """
    stem = "uemoji"
    source = InFile("in/%s.icu" % stem)
    target = OutFile("%s.icu" % stem)
    request = SingleExecutionRequest(
        name = stem,
        category = stem,
        dep_targets = [],
        input_files = [source],
        output_files = [target],
        tool = IcuTool("icupkg"),
        args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
        format_with = {}
    )
    return [request]
|
||||
|
||||
|
||||
def generate_misc(config, io, common_vars):
|
||||
# Misc Data Res Files
|
||||
input_files = [InFile(filename) for filename in io.glob("misc/*.txt")]
|
||||
|
|
Binary file not shown.
BIN
icu4c/source/data/in/uemoji.icu
Normal file
BIN
icu4c/source/data/in/uemoji.icu
Normal file
Binary file not shown.
Binary file not shown.
|
@ -38,7 +38,7 @@ https://unicode-org.atlassian.net/browse/ICU-21635
|
|||
|
||||
* Command-line environment setup
|
||||
|
||||
export UNICODE_DATA=~/unidata/uni14/20210819
|
||||
export UNICODE_DATA=~/unidata/uni14/20210903
|
||||
export CLDR_SRC=~/cldr/uni/src
|
||||
export ICU_ROOT=~/icu/uni
|
||||
export ICU_SRC=$ICU_ROOT/src
|
||||
|
|
1469
icu4c/source/data/unidata/emoji-sequences.txt
Normal file
1469
icu4c/source/data/unidata/emoji-sequences.txt
Normal file
File diff suppressed because it is too large
Load diff
1410
icu4c/source/data/unidata/emoji-zwj-sequences.txt
Normal file
1410
icu4c/source/data/unidata/emoji-zwj-sequences.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -27,6 +27,9 @@ rm $ICU4C_DATA_IN/coll/*.icu
|
|||
# icu4c/source/i18n/collationfcd.cpp is generated by genuca;
|
||||
# probably hard to build genuca without depending on the old version.
|
||||
|
||||
# Exit this shell script when a command fails.
|
||||
set -e
|
||||
|
||||
# Generate normalization data files directly into the source tree.
|
||||
bazelisk run //icu4c/source/tools/gennorm2 -- -o $ICU4C_COMMON/norm2_nfc_data.h -s $ICU4C_NORM2 nfc.txt --csource
|
||||
bazelisk run //icu4c/source/tools/gennorm2 -- -o $ICU4C_DATA_IN/nfc.nrm -s $ICU4C_NORM2 nfc.txt
|
||||
|
|
|
@ -7,6 +7,7 @@ property;Catalog;age;Age
|
|||
property;Binary;AHex;ASCII_Hex_Digit
|
||||
property;Binary;;alnum
|
||||
property;Binary;Alpha;Alphabetic
|
||||
property;Binary;Basic_Emoji;Basic_Emoji
|
||||
property;Enumerated;bc;Bidi_Class
|
||||
property;Binary;Bidi_C;Bidi_Control
|
||||
property;Binary;Bidi_M;Bidi_Mirrored
|
||||
|
@ -38,6 +39,7 @@ property;Binary;EBase;Emoji_Modifier_Base
|
|||
property;Binary;EComp;Emoji_Component
|
||||
property;Binary;EMod;Emoji_Modifier
|
||||
property;Binary;Emoji;Emoji
|
||||
property;Binary;Emoji_Keycap_Sequence;Emoji_Keycap_Sequence
|
||||
property;Binary;EPres;Emoji_Presentation
|
||||
property;Miscellaneous;EqUIdeo;Equivalent_Unified_Ideograph
|
||||
property;Binary;Ext;Extender
|
||||
|
@ -91,6 +93,11 @@ property;Binary;PCM;Prepended_Concatenation_Mark
|
|||
property;Binary;;print
|
||||
property;Binary;QMark;Quotation_Mark
|
||||
property;Binary;Radical;Radical
|
||||
property;Binary;RGI_Emoji;RGI_Emoji
|
||||
property;Binary;RGI_Emoji_Flag_Sequence;RGI_Emoji_Flag_Sequence
|
||||
property;Binary;RGI_Emoji_Modifier_Sequence;RGI_Emoji_Modifier_Sequence
|
||||
property;Binary;RGI_Emoji_Tag_Sequence;RGI_Emoji_Tag_Sequence
|
||||
property;Binary;RGI_Emoji_ZWJ_Sequence;RGI_Emoji_ZWJ_Sequence
|
||||
property;Binary;RI;Regional_Indicator
|
||||
property;Enumerated;SB;Sentence_Break
|
||||
property;Catalog;sc;Script
|
||||
|
|
|
@ -2955,6 +2955,14 @@ TestAdditionalProperties() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// C API coverage
|
||||
if (u_stringHasBinaryProperty(u"⏱", 1, UCHAR_BASIC_EMOJI) ||
|
||||
u_stringHasBinaryProperty(u"⏱", -1, UCHAR_BASIC_EMOJI) ||
|
||||
!u_stringHasBinaryProperty(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI) ||
|
||||
!u_stringHasBinaryProperty(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI)) {
|
||||
log_data_err("error: u_stringHasBinaryProperty(stopwatch variants) is wrong\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
# created on: 2011may26
|
||||
# created by: Markus W. Scherer
|
||||
#
|
||||
# See http://site.icu-project.org/processes/release/tasks/healthy-code#TOC-Check-library-dependencies
|
||||
# See https://unicode-org.github.io/icu/processes/release/tasks/healthy-code.html#check-library-dependencies
|
||||
|
||||
# Standard library symbols used by ICU --------------------------------------- #
|
||||
|
||||
|
@ -172,7 +172,7 @@ library: common
|
|||
static_unicode_sets
|
||||
uiter edits
|
||||
ucasemap ucasemap_titlecase_brkiter script_runs
|
||||
uprops ubidi_props ucase uscript uscript_props characterproperties
|
||||
uprops ubidi_props ucase uscript uscript_props emojiprops characterproperties
|
||||
ubidi ushape ubiditransform
|
||||
resourcebundle service_registration resbund_cnv ures_cnv icudataver ucat
|
||||
currency
|
||||
|
@ -411,6 +411,7 @@ group: uprops
|
|||
ubidi_props
|
||||
unistr_case ustring_case # only for case folding
|
||||
ucase
|
||||
emojiprops
|
||||
|
||||
group: characterproperties
|
||||
characterproperties.o
|
||||
|
@ -474,6 +475,12 @@ group: ubidi_props
|
|||
deps
|
||||
utrie2
|
||||
|
||||
group: emojiprops
|
||||
emojiprops.o
|
||||
deps
|
||||
ucharstrie ucharstrieiterator
|
||||
ucptrie udata
|
||||
|
||||
group: unistr_props
|
||||
unistr_props.o
|
||||
deps
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#! /usr/bin/python -B
|
||||
#! /usr/bin/python3 -B
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
|
|
|
@ -1929,9 +1929,9 @@ UBool IntlTest::assertTrue(const char* message, UBool condition, UBool quiet, UB
|
|||
UBool IntlTest::assertFalse(const char* message, UBool condition, UBool quiet, UBool possibleDataError) {
|
||||
if (condition) {
|
||||
if (possibleDataError) {
|
||||
dataerrln("FAIL: assertTrue() failed: %s", message);
|
||||
dataerrln("FAIL: assertFalse() failed: %s", message);
|
||||
} else {
|
||||
errln("FAIL: assertTrue() failed: %s", message);
|
||||
errln("FAIL: assertFalse() failed: %s", message);
|
||||
}
|
||||
} else if (!quiet) {
|
||||
logln("Ok: %s", message);
|
||||
|
|
|
@ -1883,11 +1883,11 @@ private:
|
|||
UnicodeSet *fMidNumSet;
|
||||
UnicodeSet *fNumericSet;
|
||||
UnicodeSet *fFormatSet;
|
||||
UnicodeSet *fOtherSet;
|
||||
UnicodeSet *fOtherSet = nullptr;
|
||||
UnicodeSet *fExtendSet;
|
||||
UnicodeSet *fExtendNumLetSet;
|
||||
UnicodeSet *fWSegSpaceSet;
|
||||
UnicodeSet *fDictionarySet;
|
||||
UnicodeSet *fDictionarySet = nullptr;
|
||||
UnicodeSet *fZWJSet;
|
||||
UnicodeSet *fExtendedPictSet;
|
||||
|
||||
|
@ -1926,6 +1926,11 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
|
||||
fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
|
||||
fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
|
||||
if(U_FAILURE(status)) {
|
||||
IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
|
||||
deferredStatus = status;
|
||||
return;
|
||||
}
|
||||
|
||||
fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
|
||||
fDictionarySet->addAll(*fKatakanaSet);
|
||||
|
|
|
@ -65,6 +65,7 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
TESTCASE_AUTO(TestScriptMetadata);
|
||||
TESTCASE_AUTO(TestBidiPairedBracketType);
|
||||
TESTCASE_AUTO(TestEmojiProperties);
|
||||
TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
|
||||
TESTCASE_AUTO(TestIndicPositionalCategory);
|
||||
TESTCASE_AUTO(TestIndicSyllabicCategory);
|
||||
TESTCASE_AUTO(TestVerticalOrientation);
|
||||
|
@ -545,6 +546,177 @@ void UnicodeTest::TestEmojiProperties() {
|
|||
u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
UBool hbp(const UChar *s, int32_t length, UProperty which) {
|
||||
return u_stringHasBinaryProperty(s, length, which);
|
||||
}
|
||||
|
||||
UBool hbp(const UChar *s, UProperty which) {
|
||||
return u_stringHasBinaryProperty(s, -1, which);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void UnicodeTest::TestEmojiPropertiesOfStrings() {
|
||||
// Property of code points, for coverage
|
||||
assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
|
||||
assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
|
||||
assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("bicycle is not Ideographic", hbp(u"🚲", 2, UCHAR_IDEOGRAPHIC));
|
||||
assertFalse("bicycle/0 is not Ideographic", hbp(u"🚲", -1, UCHAR_IDEOGRAPHIC));
|
||||
assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
|
||||
assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
|
||||
|
||||
// Property of (code points and) strings
|
||||
assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
|
||||
assertTrue("bicycle is Basic_Emoji", hbp(u"🚲", 2, UCHAR_BASIC_EMOJI));
|
||||
assertTrue("bicycle/0 is Basic_Emoji", hbp(u"🚲", -1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("2*bicycle is Basic_Emoji", hbp(u"🚲🚲", 4, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"🚲🚲", -1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
|
||||
|
||||
assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
|
||||
assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
|
||||
assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
|
||||
assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
|
||||
|
||||
assertFalse("chipmunk is not Basic_Emoji", hbp(u"🐿", UCHAR_BASIC_EMOJI));
|
||||
assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"🐿\uFE0F", UCHAR_BASIC_EMOJI));
|
||||
assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"🐿\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
|
||||
|
||||
// Properties of strings (only)
|
||||
assertFalse("4+emoji is not Emoji_Keycap_Sequence",
|
||||
hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
|
||||
assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
|
||||
hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
|
||||
|
||||
assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
|
||||
hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
|
||||
assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
|
||||
hbp(u"🇧🇪", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
|
||||
|
||||
assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
|
||||
hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
|
||||
assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
|
||||
hbp(u"🏴", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
|
||||
|
||||
assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
|
||||
hbp(u"🚴", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
|
||||
assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
|
||||
hbp(u"🚴\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
|
||||
|
||||
assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
|
||||
hbp(u"👩\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
|
||||
assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
|
||||
hbp(u"👩\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
|
||||
|
||||
// RGI_Emoji = all of the above
|
||||
assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
|
||||
assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertFalse("chipmunk is not RGI_Emoji", hbp(u"🐿", UCHAR_RGI_EMOJI));
|
||||
assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"🐿\uFE0F", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
|
||||
assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
|
||||
assertTrue("[BE] is RGI_Emoji", hbp(u"🇧🇪", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
|
||||
assertTrue("[Scotland] is RGI_Emoji", hbp(u"🏴", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertTrue("bicyclist is RGI_Emoji", hbp(u"🚴", UCHAR_RGI_EMOJI));
|
||||
assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"🚴\U0001F3FD", UCHAR_RGI_EMOJI));
|
||||
|
||||
assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"👩\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
|
||||
assertTrue("woman pilot: dark skin tone is RGI_Emoji",
|
||||
hbp(u"👩\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
|
||||
|
||||
// UnicodeSet with properties of strings
|
||||
IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
|
||||
UnicodeSet basic("[:Basic_Emoji:]", errorCode);
|
||||
UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
|
||||
UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
|
||||
UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
|
||||
UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
|
||||
UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
|
||||
UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
|
||||
if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
|
||||
return;
|
||||
}
|
||||
|
||||
// union of all sets except for "rgi" -- should be the same as "rgi"
|
||||
UnicodeSet all(basic);
|
||||
all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
|
||||
|
||||
UnicodeSet basicOnlyCp(basic);
|
||||
basicOnlyCp.removeAllStrings();
|
||||
|
||||
UnicodeSet rgiOnlyCp(rgi);
|
||||
rgiOnlyCp.removeAllStrings();
|
||||
|
||||
assertTrue("lots of Basic_Emoji", basic.size() > 1000);
|
||||
assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
|
||||
assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
|
||||
assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
|
||||
assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
|
||||
assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
|
||||
assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
|
||||
|
||||
assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
|
||||
assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
|
||||
assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
|
||||
assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
|
||||
|
||||
assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
|
||||
assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
|
||||
assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
|
||||
rgiOnlyCp.size(), basicOnlyCp.size());
|
||||
assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
|
||||
assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
|
||||
assertTrue("RGI_Emoji == union", rgi == all);
|
||||
|
||||
assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
|
||||
assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"🐿\uFE0F"));
|
||||
assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
|
||||
keycaps.contains(u"4\uFE0F\u20E3"));
|
||||
assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u"🇧🇪"));
|
||||
assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u"🏴"));
|
||||
assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
|
||||
modified.contains(u"🚴\U0001F3FD"));
|
||||
assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
|
||||
combos.contains(u"👩\U0001F3FF\u200D✈\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"🐿\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
|
||||
assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u"🇧🇪"));
|
||||
assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
|
||||
assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u"🏴"));
|
||||
assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u"🚴"));
|
||||
assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"🚴\U0001F3FD"));
|
||||
assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"👩\U0001F3FF\u200D✈\uFE0F"));
|
||||
}
|
||||
|
||||
void UnicodeTest::TestIndicPositionalCategory() {
|
||||
IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
|
||||
UnicodeSet na(u"[:InPC=NA:]", errorCode);
|
||||
|
@ -633,8 +805,8 @@ void UnicodeTest::TestBinaryCharacterProperties() {
|
|||
continue;
|
||||
}
|
||||
const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
|
||||
int32_t size = set.size();
|
||||
if (size == 0) {
|
||||
int32_t count = set.getRangeCount();
|
||||
if (count == 0) {
|
||||
assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
|
||||
u_hasBinaryProperty(0x20, (UProperty)prop));
|
||||
assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
|
||||
|
@ -642,7 +814,7 @@ void UnicodeTest::TestBinaryCharacterProperties() {
|
|||
assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
|
||||
u_hasBinaryProperty(0x4e00, (UProperty)prop));
|
||||
} else {
|
||||
UChar32 c = set.charAt(0);
|
||||
UChar32 c = set.getRangeStart(0);
|
||||
if (c > 0) {
|
||||
assertFalse(
|
||||
UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
|
||||
|
@ -653,7 +825,7 @@ void UnicodeTest::TestBinaryCharacterProperties() {
|
|||
UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
|
||||
u", " + prop + u")",
|
||||
u_hasBinaryProperty(c, (UProperty)prop));
|
||||
c = set.charAt(size - 1);
|
||||
c = set.getRangeEnd(count - 1);
|
||||
assertTrue(
|
||||
UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
|
||||
u", " + prop + u")",
|
||||
|
|
|
@ -41,6 +41,7 @@ public:
|
|||
void TestScriptMetadata();
|
||||
void TestBidiPairedBracketType();
|
||||
void TestEmojiProperties();
|
||||
void TestEmojiPropertiesOfStrings();
|
||||
void TestIndicPositionalCategory();
|
||||
void TestIndicSyllabicCategory();
|
||||
void TestVerticalOrientation();
|
||||
|
|
|
@ -45,6 +45,7 @@
|
|||
|
||||
/* swapping implementations in common */
|
||||
|
||||
#include "emojiprops.h"
|
||||
#include "uresdata.h"
|
||||
#include "ucnv_io.h"
|
||||
#include "uprops.h"
|
||||
|
@ -741,6 +742,115 @@ ulayout_swap(const UDataSwapper *ds,
|
|||
return headerSize + size;
|
||||
}
|
||||
|
||||
// Unicode emoji properties data swapping --------------------------------------
|
||||
|
||||
/**
 * Swaps Unicode emoji properties data (dataFormat "Emoj", formatVersion 1).
 *
 * Layout handled here, after the generic data header: an array of int32_t
 * indexes (byte offsets), then the code point trie, then the string tries
 * which are serialized as 16-bit units.
 *
 * @param ds         the swapper with the input/output byte order functions
 * @param inData     the input data, starting with the generic header
 * @param length     number of input bytes; if negative, nothing is swapped
 *                   and only the size is computed
 *                   (NOTE(review): presumably the usual preflight mode -- confirm)
 * @param outData    the output buffer; may be the same as inData
 * @param pErrorCode in/out error code
 * @return header size plus the total data size from the indexes, or 0 on error
 */
static int32_t U_CALLCONV
uemoji_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    // udata_swapDataHeader checks the arguments.
    const int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // Check data format and format version.
    // The UDataInfo is read at byte offset 4 of the input.
    const UDataInfo *info = (const UDataInfo *)((const char *)inData + 4);
    const UBool isEmojiData =
        info->dataFormat[0] == u'E' &&
        info->dataFormat[1] == u'm' &&
        info->dataFormat[2] == u'o' &&
        info->dataFormat[3] == u'j' &&
        info->formatVersion[0] == 1;
    if (!isEmojiData) {
        udata_printError(ds,
            "uemoji_swap(): data format %02x.%02x.%02x.%02x (format version %02x) "
            "is not recognized as emoji properties data\n",
            info->dataFormat[0], info->dataFormat[1],
            info->dataFormat[2], info->dataFormat[3],
            info->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
    uint8_t *outBytes = (uint8_t *)outData + headerSize;
    const int32_t *inIndexes = (const int32_t *)inBytes;

    // With a non-negative length we have real data to check and swap.
    // Once the minimum-size check below passes, length stays non-negative.
    const UBool hasData = length >= 0;
    if (hasData) {
        // From here on, length counts only the bytes after the header.
        length -= headerSize;
        // We expect to read at least EmojiProps::IX_TOTAL_SIZE.
        if (length < 14 * 4) {
            udata_printError(ds,
                "uemoji_swap(): too few bytes (%d after header) for emoji properties data\n",
                length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // The first index is the offset of the code point trie,
    // which is also the first offset after indexes[].
    const int32_t cpTrieOffset = udata_readInt32(ds, inIndexes[EmojiProps::IX_CPTRIE_OFFSET]);
    const int32_t indexesLength = cpTrieOffset / 4;
    if (indexesLength < 14) {
        udata_printError(ds,
            "uemoji_swap(): too few indexes (%d) for emoji properties data\n",
            indexesLength);
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    // Read the data offsets before swapping anything:
    // with in-place swapping, the input indexes get overwritten.
    int32_t indexes[EmojiProps::IX_TOTAL_SIZE + 1];
    indexes[0] = cpTrieOffset;
    for (int32_t i = 1; i <= EmojiProps::IX_TOTAL_SIZE; ++i) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }
    const int32_t size = indexes[EmojiProps::IX_TOTAL_SIZE];

    if (hasData) {
        if (length < size) {
            udata_printError(ds,
                "uemoji_swap(): too few bytes (%d after header) "
                "for all of emoji properties data\n",
                length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Copy the data for inaccessible bytes.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        // Swap the int32_t indexes[], which occupy the bytes before the code point trie.
        ds->swapArray32(ds, inBytes, cpTrieOffset, outBytes, pErrorCode);

        // Swap the code point trie.
        const int32_t cpTrieLimit = indexes[EmojiProps::IX_CPTRIE_OFFSET + 1];
        const int32_t cpTrieLength = cpTrieLimit - cpTrieOffset;
        U_ASSERT(cpTrieLength >= 0);
        if (cpTrieLength >= 16) {
            utrie_swapAnyVersion(ds, inBytes + cpTrieOffset, cpTrieLength,
                                 outBytes + cpTrieOffset, pErrorCode);
        }

        // Swap all of the string tries in one go.
        // They are all serialized as arrays of 16-bit units.
        const int32_t stringTriesStart = indexes[EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET];
        const int32_t stringTriesLimit =
            indexes[EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1];
        ds->swapArray16(ds, inBytes + stringTriesStart, stringTriesLimit - stringTriesStart,
                        outBytes + stringTriesStart, pErrorCode);

        // The string tries are expected to be the last part of the data.
        U_ASSERT(stringTriesLimit == size);
    }

    return headerSize + size;
}
|
||||
|
||||
/* Swap 'Test' data from gentest */
|
||||
static int32_t U_CALLCONV
|
||||
test_swap(const UDataSwapper *ds,
|
||||
|
@ -836,6 +946,8 @@ static const struct {
|
|||
{ { ULAYOUT_FMT_0, ULAYOUT_FMT_1, ULAYOUT_FMT_2, ULAYOUT_FMT_3 },
|
||||
ulayout_swap }, // dataFormat="Layo"
|
||||
|
||||
{ { u'E', u'm', u'o', u'j' }, uemoji_swap },
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
{ { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */
|
||||
{ { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */
|
||||
|
|
|
@ -70,6 +70,10 @@ public final class CharacterPropertiesImpl {
|
|||
case UCharacterProperty.SRC_VO:
|
||||
UCharacterProperty.ulayout_addPropertyStarts(src, incl);
|
||||
break;
|
||||
case UCharacterProperty.SRC_EMOJI: {
|
||||
EmojiProps.INSTANCE.addPropertyStarts(incl);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw new IllegalStateException("getInclusions(unknown src " + src + ")");
|
||||
}
|
||||
|
|
197
icu4j/main/classes/core/src/com/ibm/icu/impl/EmojiProps.java
Normal file
197
icu4j/main/classes/core/src/com/ibm/icu/impl/EmojiProps.java
Normal file
|
@ -0,0 +1,197 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// emojiprops.h
|
||||
// created: 2021sep06 Markus W. Scherer
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.BytesTrie;
|
||||
import com.ibm.icu.util.CharsTrie;
|
||||
import com.ibm.icu.util.CodePointMap;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
import com.ibm.icu.util.ICUUncheckedIOException;
|
||||
|
||||
public final class EmojiProps {
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
@Override
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0] == 1;
|
||||
}
|
||||
}
|
||||
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
|
||||
private static final int DATA_FORMAT = 0x456d6f6a; // "Emoj"
|
||||
|
||||
// Byte offsets from the start of the data, after the generic header,
|
||||
// in ascending order.
|
||||
// UCPTrie=CodePointTrie, follows the indexes
|
||||
private static final int IX_CPTRIE_OFFSET = 0;
|
||||
|
||||
// UCharsTrie=CharsTrie
|
||||
private static final int IX_BASIC_EMOJI_TRIE_OFFSET = 4;
|
||||
//ivate static final int IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET = 5;
|
||||
//ivate static final int IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET = 6;
|
||||
//ivate static final int IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET = 7;
|
||||
//ivate static final int IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET = 8;
|
||||
private static final int IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET = 9;
|
||||
|
||||
// Properties in the code point trie.
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
private static final int BIT_EMOJI = 0;
|
||||
private static final int BIT_EMOJI_PRESENTATION = 1;
|
||||
private static final int BIT_EMOJI_MODIFIER = 2;
|
||||
private static final int BIT_EMOJI_MODIFIER_BASE = 3;
|
||||
private static final int BIT_EMOJI_COMPONENT = 4;
|
||||
private static final int BIT_EXTENDED_PICTOGRAPHIC = 5;
|
||||
// https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
private static final int BIT_BASIC_EMOJI = 6;
|
||||
|
||||
public static final EmojiProps INSTANCE = new EmojiProps();
|
||||
|
||||
private CodePointTrie.Fast8 cpTrie = null;
|
||||
private String stringTries[] = new String[6];
|
||||
|
||||
/** Input i: One of the IX_..._TRIE_OFFSET indexes into the data file indexes[] array. */
|
||||
private static int getStringTrieIndex(int i) {
|
||||
return i - IX_BASIC_EMOJI_TRIE_OFFSET;
|
||||
}
|
||||
|
||||
private EmojiProps() {
|
||||
ByteBuffer bytes = ICUBinary.getRequiredData("uemoji.icu");
|
||||
try {
|
||||
ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
|
||||
int startPos = bytes.position();
|
||||
|
||||
int cpTrieOffset = bytes.getInt(); // inIndexes[IX_CPTRIE_OFFSET]
|
||||
int indexesLength = cpTrieOffset / 4;
|
||||
if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) {
|
||||
throw new ICUUncheckedIOException(
|
||||
"Emoji properties data: not enough indexes");
|
||||
}
|
||||
|
||||
int[] inIndexes = new int[indexesLength];
|
||||
inIndexes[0] = cpTrieOffset;
|
||||
for (int i = 1; i < indexesLength; ++i) {
|
||||
inIndexes[i] = bytes.getInt();
|
||||
}
|
||||
|
||||
int i = IX_CPTRIE_OFFSET;
|
||||
int offset = inIndexes[i++];
|
||||
int nextOffset = inIndexes[i];
|
||||
cpTrie = CodePointTrie.Fast8.fromBinary(bytes);
|
||||
int pos = bytes.position() - startPos;
|
||||
assert nextOffset >= pos;
|
||||
ICUBinary.skipBytes(bytes, nextOffset - pos); // skip padding after trie bytes
|
||||
|
||||
offset = nextOffset;
|
||||
nextOffset = inIndexes[IX_BASIC_EMOJI_TRIE_OFFSET];
|
||||
ICUBinary.skipBytes(bytes, nextOffset - offset); // skip unknown bytes
|
||||
|
||||
for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) {
|
||||
offset = inIndexes[i];
|
||||
nextOffset = inIndexes[i + 1];
|
||||
// Set/leave null if there is no CharsTrie.
|
||||
if (nextOffset > offset) {
|
||||
stringTries[getStringTrieIndex(i)] =
|
||||
ICUBinary.getString(bytes, (nextOffset - offset) / 2, 0);
|
||||
}
|
||||
}
|
||||
} catch(IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public UnicodeSet addPropertyStarts(UnicodeSet set) {
|
||||
// Add the start code point of each same-value range of the trie.
|
||||
CodePointMap.Range range = new CodePointMap.Range();
|
||||
int start = 0;
|
||||
while (cpTrie.getRange(start, null, range)) {
|
||||
set.add(start);
|
||||
start = range.getEnd() + 1;
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
// Note: REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
|
||||
private static final byte[] bitFlags = {
|
||||
BIT_EMOJI, // UCHAR_EMOJI=57
|
||||
BIT_EMOJI_PRESENTATION, // UCHAR_EMOJI_PRESENTATION=58
|
||||
BIT_EMOJI_MODIFIER, // UCHAR_EMOJI_MODIFIER=59
|
||||
BIT_EMOJI_MODIFIER_BASE, // UCHAR_EMOJI_MODIFIER_BASE=60
|
||||
BIT_EMOJI_COMPONENT, // UCHAR_EMOJI_COMPONENT=61
|
||||
-1, // UCHAR_REGIONAL_INDICATOR=62
|
||||
-1, // UCHAR_PREPENDED_CONCATENATION_MARK=63
|
||||
BIT_EXTENDED_PICTOGRAPHIC, // UCHAR_EXTENDED_PICTOGRAPHIC=64
|
||||
BIT_BASIC_EMOJI, // UCHAR_BASIC_EMOJI=65
|
||||
-1, // UCHAR_EMOJI_KEYCAP_SEQUENCE=66
|
||||
-1, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67
|
||||
-1, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68
|
||||
-1, // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69
|
||||
-1, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70
|
||||
BIT_BASIC_EMOJI, // UCHAR_RGI_EMOJI=71
|
||||
};
|
||||
|
||||
public boolean hasBinaryProperty(int c, int which) {
|
||||
if (which < UProperty.EMOJI || UProperty.RGI_EMOJI < which) {
|
||||
return false;
|
||||
}
|
||||
int bit = bitFlags[which - UProperty.EMOJI];
|
||||
if (bit < 0) {
|
||||
return false; // not a property that we support in this function
|
||||
}
|
||||
int bits = cpTrie.get(c);
|
||||
return ((bits >> bit) & 1) != 0;
|
||||
}
|
||||
|
||||
public boolean hasBinaryProperty(CharSequence s, int which) {
|
||||
int length = s.length();
|
||||
if (length == 0) { return false; } // empty string
|
||||
// The caller should have delegated single code points to hasBinaryProperty(c, which).
|
||||
if (which < UProperty.BASIC_EMOJI || UProperty.RGI_EMOJI < which) {
|
||||
return false;
|
||||
}
|
||||
int firstProp = which, lastProp = which;
|
||||
if (which == UProperty.RGI_EMOJI) {
|
||||
// RGI_Emoji is the union of the other emoji properties of strings.
|
||||
firstProp = UProperty.BASIC_EMOJI;
|
||||
lastProp = UProperty.RGI_EMOJI_ZWJ_SEQUENCE;
|
||||
}
|
||||
for (int prop = firstProp; prop <= lastProp; ++prop) {
|
||||
String trieUChars = stringTries[prop - UProperty.BASIC_EMOJI];
|
||||
if (trieUChars != null) {
|
||||
CharsTrie trie = new CharsTrie(trieUChars, 0);
|
||||
BytesTrie.Result result = trie.next(s, 0, length);
|
||||
if (result.hasValue()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void addStrings(int which, UnicodeSet set) {
|
||||
if (which < UProperty.BASIC_EMOJI || UProperty.RGI_EMOJI < which) {
|
||||
return;
|
||||
}
|
||||
int firstProp = which, lastProp = which;
|
||||
if (which == UProperty.RGI_EMOJI) {
|
||||
// RGI_Emoji is the union of the other emoji properties of strings.
|
||||
firstProp = UProperty.BASIC_EMOJI;
|
||||
lastProp = UProperty.RGI_EMOJI_ZWJ_SEQUENCE;
|
||||
}
|
||||
for (int prop = firstProp; prop <= lastProp; ++prop) {
|
||||
String trieUChars = stringTries[prop - UProperty.BASIC_EMOJI];
|
||||
if (trieUChars != null) {
|
||||
CharsTrie trie = new CharsTrie(trieUChars, 0);
|
||||
for (CharsTrie.Entry entry : trie) {
|
||||
set.add(entry.chars);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -108,8 +108,9 @@ public final class UCharacterProperty
|
|||
public static final int SRC_INPC=12;
|
||||
public static final int SRC_INSC=13;
|
||||
public static final int SRC_VO=14;
|
||||
public static final int SRC_EMOJI=15;
|
||||
/** One more than the highest UPropertySource (SRC_) constant. */
|
||||
public static final int SRC_COUNT=15;
|
||||
public static final int SRC_COUNT=16;
|
||||
|
||||
private static final class LayoutProps {
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
|
@ -352,6 +353,18 @@ public final class UCharacterProperty
|
|||
}
|
||||
}
|
||||
|
||||
private class EmojiBinaryProperty extends BinaryProperty {
|
||||
int which;
|
||||
EmojiBinaryProperty(int which) {
|
||||
super(SRC_EMOJI);
|
||||
this.which=which;
|
||||
}
|
||||
@Override
|
||||
boolean contains(int c) {
|
||||
return EmojiProps.INSTANCE.hasBinaryProperty(c, which);
|
||||
}
|
||||
}
|
||||
|
||||
private class NormInertBinaryProperty extends BinaryProperty { // UCHAR_NF*_INERT properties
|
||||
int which;
|
||||
NormInertBinaryProperty(int source, int which) {
|
||||
|
@ -534,11 +547,11 @@ public final class UCharacterProperty
|
|||
return !Normalizer2Impl.UTF16Plus.equal(dest, src);
|
||||
}
|
||||
},
|
||||
new BinaryProperty(2, 1<<PROPS_2_EMOJI),
|
||||
new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
|
||||
new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
|
||||
new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
|
||||
new BinaryProperty(2, 1<<PROPS_2_EMOJI_COMPONENT),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI_PRESENTATION),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI_MODIFIER_BASE),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI_COMPONENT),
|
||||
new BinaryProperty(SRC_PROPSVEC) { // REGIONAL_INDICATOR
|
||||
// Property starts are a subset of lb=RI etc.
|
||||
@Override
|
||||
|
@ -547,7 +560,14 @@ public final class UCharacterProperty
|
|||
}
|
||||
},
|
||||
new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK),
|
||||
new BinaryProperty(2, 1<<PROPS_2_EXTENDED_PICTOGRAPHIC),
|
||||
new EmojiBinaryProperty(UProperty.EXTENDED_PICTOGRAPHIC),
|
||||
new EmojiBinaryProperty(UProperty.BASIC_EMOJI),
|
||||
new EmojiBinaryProperty(UProperty.EMOJI_KEYCAP_SEQUENCE),
|
||||
new EmojiBinaryProperty(UProperty.RGI_EMOJI_MODIFIER_SEQUENCE),
|
||||
new EmojiBinaryProperty(UProperty.RGI_EMOJI_FLAG_SEQUENCE),
|
||||
new EmojiBinaryProperty(UProperty.RGI_EMOJI_TAG_SEQUENCE),
|
||||
new EmojiBinaryProperty(UProperty.RGI_EMOJI_ZWJ_SEQUENCE),
|
||||
new EmojiBinaryProperty(UProperty.RGI_EMOJI),
|
||||
};
|
||||
|
||||
public boolean hasBinaryProperty(int c, int which) {
|
||||
|
@ -1365,19 +1385,20 @@ public final class UCharacterProperty
|
|||
/*
|
||||
* Properties in vector word 2
|
||||
* Bits
|
||||
* 31..26 http://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
* 31..26 unused since ICU 70 added uemoji.icu;
|
||||
* in ICU 57..69 stored emoji properties
|
||||
* 25..20 Line Break
|
||||
* 19..15 Sentence Break
|
||||
* 14..10 Word Break
|
||||
* 9.. 5 Grapheme Cluster Break
|
||||
* 4.. 0 Decomposition Type
|
||||
*/
|
||||
private static final int PROPS_2_EXTENDED_PICTOGRAPHIC=26;
|
||||
private static final int PROPS_2_EMOJI_COMPONENT = 27;
|
||||
private static final int PROPS_2_EMOJI = 28;
|
||||
private static final int PROPS_2_EMOJI_PRESENTATION = 29;
|
||||
private static final int PROPS_2_EMOJI_MODIFIER = 30;
|
||||
private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31;
|
||||
//ivate static final int PROPS_2_EXTENDED_PICTOGRAPHIC=26; // ICU 62..69
|
||||
//ivate static final int PROPS_2_EMOJI_COMPONENT = 27; // ICU 60..69
|
||||
//ivate static final int PROPS_2_EMOJI = 28; // ICU 57..69
|
||||
//ivate static final int PROPS_2_EMOJI_PRESENTATION = 29; // ICU 57..69
|
||||
//ivate static final int PROPS_2_EMOJI_MODIFIER = 30; // ICU 57..69
|
||||
//ivate static final int PROPS_2_EMOJI_MODIFIER_BASE = 31; // ICU 57..69
|
||||
|
||||
private static final int LB_MASK = 0x03f00000;
|
||||
private static final int LB_SHIFT = 20;
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
package com.ibm.icu.lang;
|
||||
|
||||
import com.ibm.icu.impl.CharacterPropertiesImpl;
|
||||
import com.ibm.icu.impl.EmojiProps;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.CodePointMap;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
|
@ -29,6 +30,15 @@ public final class CharacterProperties {
|
|||
|
||||
private static UnicodeSet makeSet(int property) {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
if (UProperty.BASIC_EMOJI <= property && property <= UProperty.RGI_EMOJI) {
|
||||
// property of strings
|
||||
EmojiProps.INSTANCE.addStrings(property, set);
|
||||
if (property != UProperty.BASIC_EMOJI && property != UProperty.RGI_EMOJI) {
|
||||
// property of _only_ strings
|
||||
return set.freeze();
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
|
||||
int numRanges = inclusions.getRangeCount();
|
||||
int startHasProperty = -1;
|
||||
|
|
|
@ -16,6 +16,7 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
|
||||
import com.ibm.icu.impl.CaseMapImpl;
|
||||
import com.ibm.icu.impl.EmojiProps;
|
||||
import com.ibm.icu.impl.IllegalIcuArgumentException;
|
||||
import com.ibm.icu.impl.Trie2;
|
||||
import com.ibm.icu.impl.UBiDiProps;
|
||||
|
@ -5920,6 +5921,43 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
|||
return UCharacterProperty.INSTANCE.hasBinaryProperty(ch, property);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@icu} Returns true if the property is true for the string.
|
||||
* Same as {@link #hasBinaryProperty(int, int)}
|
||||
* if the string contains exactly one code point.
|
||||
*
|
||||
* <p>Most properties apply only to single code points.
|
||||
* <a href="https://www.unicode.org/reports/tr51/#Emoji_Sets">UTS #51 Unicode Emoji</a>
|
||||
* defines several properties of strings.
|
||||
*
|
||||
* @param s String to test.
|
||||
* @param property UProperty selector constant, identifies which binary property to check.
|
||||
* Must be BINARY_START<=which<BINARY_LIMIT.
|
||||
* @return true or false according to the binary Unicode property value for the string.
|
||||
* Also false if <code>property</code> is out of bounds or if the Unicode version
|
||||
* does not have data for the property at all.
|
||||
*
|
||||
* @see com.ibm.icu.lang.UProperty
|
||||
* @see CharacterProperties#getBinaryPropertySet(int)
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static boolean hasBinaryProperty(CharSequence s, int property) {
|
||||
int length = s.length();
|
||||
if (length == 1) {
|
||||
return hasBinaryProperty(s.charAt(0), property); // single code point
|
||||
} else if (length == 2) {
|
||||
// first code point
|
||||
int c = Character.codePointAt(s, 0);
|
||||
if (Character.charCount(c) == length) {
|
||||
return hasBinaryProperty(c, property); // single code point
|
||||
}
|
||||
}
|
||||
// Only call into EmojiProps for a relevant property,
|
||||
// so that we not unnecessarily try to load its data file.
|
||||
return UProperty.BASIC_EMOJI <= property && property <= UProperty.RGI_EMOJI &&
|
||||
EmojiProps.INSTANCE.hasBinaryProperty(s, property);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@icu} <p>Check if a code point has the Alphabetic Unicode property.
|
||||
* <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).
|
||||
|
|
|
@ -567,13 +567,62 @@ public interface UProperty
|
|||
* @stable ICU 62
|
||||
*/
|
||||
public static final int EXTENDED_PICTOGRAPHIC=64;
|
||||
/**
|
||||
* Binary property of strings Basic_Emoji.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int BASIC_EMOJI=65;
|
||||
/**
|
||||
* Binary property of strings Emoji_Keycap_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int EMOJI_KEYCAP_SEQUENCE=66;
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Modifier_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int RGI_EMOJI_MODIFIER_SEQUENCE=67;
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Flag_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int RGI_EMOJI_FLAG_SEQUENCE=68;
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_Tag_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int RGI_EMOJI_TAG_SEQUENCE=69;
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji_ZWJ_Sequence.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int RGI_EMOJI_ZWJ_SEQUENCE=70;
|
||||
/**
|
||||
* Binary property of strings RGI_Emoji.
|
||||
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
*
|
||||
* @draft ICU 70
|
||||
*/
|
||||
public static final int RGI_EMOJI=71;
|
||||
|
||||
/**
|
||||
* One more than the last constant for binary Unicode properties.
|
||||
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final int BINARY_LIMIT = 65;
|
||||
public static final int BINARY_LIMIT = 72;
|
||||
|
||||
/**
|
||||
* Enumerated property Bidi_Class.
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1353025e1c11978a08634320d78eb0ab2dc6755e9e966e6100429815165f37b5
|
||||
size 13622547
|
||||
oid sha256:87b44d8bdf19a56188c51fb4348630396dba560115cd0918f7a89c8d2871a26e
|
||||
size 13626217
|
||||
|
|
|
@ -2564,6 +2564,150 @@ public final class UCharacterTest extends TestFmwk
|
|||
UCharacter.hasBinaryProperty(0xA9, UProperty.EXTENDED_PICTOGRAPHIC));
|
||||
}
|
||||
|
||||
private static boolean hbp(CharSequence s, int property) {
|
||||
return UCharacter.hasBinaryProperty(s, property);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestEmojiPropertiesOfStrings() {
|
||||
// Property of code points, for coverage
|
||||
assertFalse("empty string is not Ideographic", hbp("", UProperty.IDEOGRAPHIC));
|
||||
assertFalse("L is not Ideographic", hbp("L", UProperty.IDEOGRAPHIC));
|
||||
assertTrue("U+4E02 is Ideographic", hbp("丂", UProperty.IDEOGRAPHIC));
|
||||
assertFalse("2*U+4E02 is not Ideographic", hbp("丂丂", UProperty.IDEOGRAPHIC));
|
||||
assertFalse("bicycle is not Ideographic", hbp("🚲", UProperty.IDEOGRAPHIC));
|
||||
assertTrue("U+23456 is Ideographic", hbp(UTF16.valueOf(0x23456), UProperty.IDEOGRAPHIC));
|
||||
|
||||
// Property of (code points and) strings
|
||||
assertFalse("empty string is not Basic_Emoji", hbp("", UProperty.BASIC_EMOJI));
|
||||
assertFalse("L is not Basic_Emoji", hbp("L", UProperty.BASIC_EMOJI));
|
||||
assertFalse("U+4E02 is not Basic_Emoji", hbp("丂", UProperty.BASIC_EMOJI));
|
||||
assertTrue("bicycle is Basic_Emoji", hbp("🚲", UProperty.BASIC_EMOJI));
|
||||
assertFalse("2*bicycle is Basic_Emoji", hbp("🚲🚲", UProperty.BASIC_EMOJI));
|
||||
assertFalse("U+23456 is not Basic_Emoji", hbp(UTF16.valueOf(0x23456), UProperty.BASIC_EMOJI));
|
||||
|
||||
assertFalse("stopwatch is not Basic_Emoji", hbp("⏱", UProperty.BASIC_EMOJI));
|
||||
assertTrue("stopwatch+emoji is Basic_Emoji", hbp("⏱\uFE0F", UProperty.BASIC_EMOJI));
|
||||
|
||||
assertFalse("chipmunk is not Basic_Emoji", hbp("🐿", UProperty.BASIC_EMOJI));
|
||||
assertTrue("chipmunk+emoji is Basic_Emoji", hbp("🐿\uFE0F", UProperty.BASIC_EMOJI));
|
||||
assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp("🐿\uFE0F\uFE0F", UProperty.BASIC_EMOJI));
|
||||
|
||||
// Properties of strings (only)
|
||||
assertFalse("4+emoji is not Emoji_Keycap_Sequence",
|
||||
hbp("4\uFE0F", UProperty.EMOJI_KEYCAP_SEQUENCE));
|
||||
assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
|
||||
hbp("4\uFE0F\u20E3", UProperty.EMOJI_KEYCAP_SEQUENCE));
|
||||
|
||||
assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
|
||||
hbp(UTF16.valueOf(0x1F1E7), UProperty.RGI_EMOJI_FLAG_SEQUENCE));
|
||||
assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
|
||||
hbp("🇧🇪", UProperty.RGI_EMOJI_FLAG_SEQUENCE));
|
||||
|
||||
assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
|
||||
hbp(UTF16.valueOf(0x1F3F4), UProperty.RGI_EMOJI_TAG_SEQUENCE));
|
||||
assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
|
||||
hbp("🏴", UProperty.RGI_EMOJI_TAG_SEQUENCE));
|
||||
|
||||
assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
|
||||
hbp("🚴", UProperty.RGI_EMOJI_MODIFIER_SEQUENCE));
|
||||
assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
|
||||
hbp("🚴" + UTF16.valueOf(0x1F3FD), UProperty.RGI_EMOJI_MODIFIER_SEQUENCE));
|
||||
|
||||
assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
|
||||
hbp("👩" + UTF16.valueOf(0x1F3FF) + "\u200D", UProperty.RGI_EMOJI_ZWJ_SEQUENCE));
|
||||
assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
|
||||
hbp("👩" + UTF16.valueOf(0x1F3FF) + "\u200D✈\uFE0F",
|
||||
UProperty.RGI_EMOJI_ZWJ_SEQUENCE));
|
||||
|
||||
// RGI_Emoji = all of the above
|
||||
assertFalse("stopwatch is not RGI_Emoji", hbp("⏱", UProperty.RGI_EMOJI));
|
||||
assertTrue("stopwatch+emoji is RGI_Emoji", hbp("⏱\uFE0F", UProperty.RGI_EMOJI));
|
||||
|
||||
assertFalse("chipmunk is not RGI_Emoji", hbp("🐿", UProperty.RGI_EMOJI));
|
||||
assertTrue("chipmunk+emoji is RGI_Emoji", hbp("🐿\uFE0F", UProperty.RGI_EMOJI));
|
||||
|
||||
assertFalse("4+emoji is not RGI_Emoji", hbp("4\uFE0F", UProperty.RGI_EMOJI));
|
||||
assertTrue("4+emoji+keycap is RGI_Emoji", hbp("4\uFE0F\u20E3", UProperty.RGI_EMOJI));
|
||||
|
||||
assertFalse("[B] is not RGI_Emoji", hbp(UTF16.valueOf(0x1F1E7), UProperty.RGI_EMOJI));
|
||||
assertTrue("[BE] is RGI_Emoji", hbp("🇧🇪", UProperty.RGI_EMOJI));
|
||||
|
||||
assertTrue("[flag] is RGI_Emoji", hbp(UTF16.valueOf(0x1F3F4), UProperty.RGI_EMOJI));
|
||||
assertTrue("[Scotland] is RGI_Emoji", hbp("🏴", UProperty.RGI_EMOJI));
|
||||
|
||||
assertTrue("bicyclist is RGI_Emoji", hbp("🚴", UProperty.RGI_EMOJI));
|
||||
assertTrue("bicyclist+medium is RGI_Emoji",
|
||||
hbp("🚴" + UTF16.valueOf(0x1F3FD), UProperty.RGI_EMOJI));
|
||||
|
||||
assertFalse("woman+dark+ZWJ is not RGI_Emoji",
|
||||
hbp("👩" + UTF16.valueOf(0x1F3FF) + "\u200D", UProperty.RGI_EMOJI));
|
||||
assertTrue("woman pilot: dark skin tone is RGI_Emoji",
|
||||
hbp("👩" + UTF16.valueOf(0x1F3FF) + "\u200D✈\uFE0F", UProperty.RGI_EMOJI));
|
||||
|
||||
// UnicodeSet with properties of strings
|
||||
UnicodeSet basic = new UnicodeSet("[:Basic_Emoji:]");
|
||||
UnicodeSet keycaps = new UnicodeSet("[:Emoji_Keycap_Sequence:]");
|
||||
UnicodeSet modified = new UnicodeSet("[:RGI_Emoji_Modifier_Sequence:]");
|
||||
UnicodeSet flags = new UnicodeSet("[:RGI_Emoji_Flag_Sequence:]");
|
||||
UnicodeSet tags = new UnicodeSet("[:RGI_Emoji_Tag_Sequence:]");
|
||||
UnicodeSet combos = new UnicodeSet("[:RGI_Emoji_ZWJ_Sequence:]");
|
||||
UnicodeSet rgi = new UnicodeSet("[:RGI_Emoji:]");
|
||||
|
||||
// union of all sets except for "rgi" -- should be the same as "rgi"
|
||||
UnicodeSet all = new UnicodeSet(basic);
|
||||
all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
|
||||
|
||||
UnicodeSet basicOnlyCp = new UnicodeSet(basic).removeAllStrings();
|
||||
UnicodeSet rgiOnlyCp = new UnicodeSet(rgi).removeAllStrings();
|
||||
|
||||
assertTrue("lots of Basic_Emoji", basic.size() > 1000);
|
||||
assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
|
||||
assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
|
||||
assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
|
||||
assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
|
||||
assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
|
||||
assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
|
||||
|
||||
assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
|
||||
assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
|
||||
assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
|
||||
assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
|
||||
assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
|
||||
|
||||
assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
|
||||
assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
|
||||
assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
|
||||
rgiOnlyCp.size(), basicOnlyCp.size());
|
||||
assertEquals("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp, basicOnlyCp);
|
||||
assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
|
||||
assertEquals("RGI_Emoji == union", rgi, all);
|
||||
|
||||
assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains("⏱\uFE0F"));
|
||||
assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains("🐿\uFE0F"));
|
||||
assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
|
||||
keycaps.contains("4\uFE0F\u20E3"));
|
||||
assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains("🇧🇪"));
|
||||
assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains("🏴"));
|
||||
assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
|
||||
modified.contains("🚴" + UTF16.valueOf(0x1F3FD)));
|
||||
assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
|
||||
combos.contains("👩" + UTF16.valueOf(0x1F3FF) + "\u200D✈\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains("⏱\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains("🐿\uFE0F"));
|
||||
assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains("4\uFE0F\u20E3"));
|
||||
assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains("🇧🇪"));
|
||||
assertTrue("RGI_Emoji.contains([flag])", rgi.contains(UTF16.valueOf(0x1F3F4)));
|
||||
assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains("🏴"));
|
||||
assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains("🚴"));
|
||||
assertTrue("RGI_Emoji.contains(bicyclist+medium)",
|
||||
rgi.contains("🚴" + UTF16.valueOf(0x1F3FD)));
|
||||
assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)",
|
||||
rgi.contains("👩" + UTF16.valueOf(0x1F3FF) + "\u200D✈\uFE0F"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestIndicPositionalCategory() {
|
||||
UnicodeSet na = new UnicodeSet("[:InPC=NA:]");
|
||||
|
@ -3689,8 +3833,8 @@ public final class UCharacterTest extends TestFmwk
|
|||
// Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
|
||||
for (int prop = 0; prop < UProperty.BINARY_LIMIT; ++prop) {
|
||||
UnicodeSet set = CharacterProperties.getBinaryPropertySet(prop);
|
||||
int size = set.size();
|
||||
if (size == 0) {
|
||||
int count = set.getRangeCount();
|
||||
if (count == 0) {
|
||||
assertFalse("!hasBinaryProperty(U+0020, " + prop + ')',
|
||||
UCharacter.hasBinaryProperty(0x20, prop));
|
||||
assertFalse("!hasBinaryProperty(U+0061, " + prop + ')',
|
||||
|
@ -3698,14 +3842,14 @@ public final class UCharacterTest extends TestFmwk
|
|||
assertFalse("!hasBinaryProperty(U+4E00, " + prop + ')',
|
||||
UCharacter.hasBinaryProperty(0x4e00, prop));
|
||||
} else {
|
||||
int c = set.charAt(0);
|
||||
int c = set.getRangeStart(0);
|
||||
if (c > 0) {
|
||||
assertFalse("!hasBinaryProperty(" + Utility.hex(c - 1) + ", " + prop + ')',
|
||||
UCharacter.hasBinaryProperty(c - 1, prop));
|
||||
}
|
||||
assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
|
||||
UCharacter.hasBinaryProperty(c, prop));
|
||||
c = set.charAt(size - 1);
|
||||
c = set.getRangeEnd(count - 1);
|
||||
assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
|
||||
UCharacter.hasBinaryProperty(c, prop));
|
||||
if (c < 0x10ffff) {
|
||||
|
|
|
@ -152,9 +152,20 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
UnicodeSet collectedErrors = new UnicodeSet();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
|
||||
int value = UCharacter.getIntPropertyValue(it.codepoint, propNum);
|
||||
if (value != valueNum) {
|
||||
collectedErrors.add(it.codepoint);
|
||||
if (it.codepoint == UnicodeSetIterator.IS_STRING) {
|
||||
// For binary properties of strings, only [:propName=true:] _should_ yield strings.
|
||||
// Therefore, we should always have valueNum=1 and b=true.
|
||||
// TODO: ICU-21524 ^ and propName=N use complement() which leaves strings alone.
|
||||
boolean b = UCharacter.hasBinaryProperty(it.string, propNum);
|
||||
int value = b ? 1 : 0;
|
||||
if (value != valueNum && /* TODO: ICU-21524 */ valueNum != 0) {
|
||||
collectedErrors.add(it.string);
|
||||
}
|
||||
} else {
|
||||
int value = UCharacter.getIntPropertyValue(it.codepoint, propNum);
|
||||
if (value != valueNum) {
|
||||
collectedErrors.add(it.codepoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (collectedErrors.size() != 0) {
|
||||
|
|
|
@ -31,12 +31,13 @@ cc_binary(
|
|||
"//icu4c/source/common:bytestriebuilder",
|
||||
"//icu4c/source/common:propsvec",
|
||||
"//icu4c/source/common:errorcode",
|
||||
"//icu4c/source/common:ucharstriebuilder",
|
||||
"//icu4c/source/common:uniset",
|
||||
"//icu4c/source/common:uvector32",
|
||||
|
||||
"//icu4c/source/common:platform",
|
||||
"//icu4c/source/common:headers",
|
||||
|
||||
|
||||
"//icu4c/source/tools/toolutil:ppucd",
|
||||
"//icu4c/source/tools/toolutil:unewdata",
|
||||
"//icu4c/source/tools/toolutil:writesrc",
|
||||
|
|
|
@ -47,7 +47,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
|||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 7.7 .
|
||||
The following is a description of format version 7.8 .
|
||||
|
||||
Data contents:
|
||||
|
||||
|
@ -286,6 +286,11 @@ ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction charact
|
|||
ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
|
||||
The value is split across bits 21..20 & 7..0.
|
||||
|
||||
--- Changes in format version 7.8 ---
|
||||
|
||||
ICU 70 moves the emoji properties from uprops.icu to (new) uemoji.icu.
|
||||
The 6 bits in vector word 2 that stored emoji properties are unused again.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -301,8 +306,8 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
|
||||
{ 7, 7, 0, 0 }, /* formatVersion */
|
||||
{ 10, 0, 0, 0 } /* dataVersion */
|
||||
{ 7, 8, 0, 0 }, /* formatVersion */
|
||||
{ 14, 0, 0, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
|
||||
|
@ -625,13 +630,7 @@ propToBinaries[]={
|
|||
{ UCHAR_ID_CONTINUE, 1, UPROPS_ID_CONTINUE },
|
||||
{ UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
|
||||
|
||||
{ UCHAR_EMOJI, 2, UPROPS_2_EMOJI },
|
||||
{ UCHAR_EMOJI_PRESENTATION, 2, UPROPS_2_EMOJI_PRESENTATION },
|
||||
{ UCHAR_EMOJI_MODIFIER, 2, UPROPS_2_EMOJI_MODIFIER },
|
||||
{ UCHAR_EMOJI_MODIFIER_BASE, 2, UPROPS_2_EMOJI_MODIFIER_BASE },
|
||||
{ UCHAR_EMOJI_COMPONENT, 2, UPROPS_2_EMOJI_COMPONENT },
|
||||
{ UCHAR_PREPENDED_CONCATENATION_MARK, 1, UPROPS_PREPENDED_CONCATENATION_MARK },
|
||||
{ UCHAR_EXTENDED_PICTOGRAPHIC, 2, UPROPS_2_EXTENDED_PICTOGRAPHIC },
|
||||
};
|
||||
|
||||
struct PropToEnum {
|
||||
|
|
563
tools/unicode/c/genprops/emojipropsbuilder.cpp
Normal file
563
tools/unicode/c/genprops/emojipropsbuilder.cpp
Normal file
|
@ -0,0 +1,563 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: https://www.unicode.org/copyright.html
|
||||
|
||||
// emojipropsbuilder.cpp
|
||||
// created: 2021sep03 Markus W. Scherer
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucharstriebuilder.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "emojiprops.h"
|
||||
#include "genprops.h"
|
||||
#include "uassert.h"
|
||||
#include "unewdata.h"
|
||||
#include "uparse.h"
|
||||
|
||||
/* Emoji properties file format ------------------------------------------------
|
||||
|
||||
The file format prepared and written here contains several data
|
||||
structures that store indexes or data.
|
||||
|
||||
Before the data contents described below, there are the headers required by
|
||||
the udata API for loading ICU data. Especially, a UDataInfo structure
|
||||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 1.0 .
|
||||
|
||||
The file contains the following structures:
|
||||
|
||||
const int32_t indexes[] with values i0, i1, ...:
|
||||
(see EmojiProps::IX_... constants for names of indexes)
|
||||
|
||||
The length of the indexes[] array is indexes[IX_CPTRIE_OFFSET]/4;
|
||||
|
||||
The first 14 indexes are byte offsets in ascending order.
|
||||
Each byte offset marks the start of a part in the data file,
|
||||
and the limit (exclusive end) of the previous one.
|
||||
When two consecutive byte offsets are the same, then the corresponding part is empty.
|
||||
Byte offsets are offsets from after the header, that is, from the beginning of the indexes[].
|
||||
Each part starts at an offset with proper alignment for its data.
|
||||
If necessary, the previous part may include padding bytes to achieve this alignment.
|
||||
|
||||
i0 offset of cpTrie (and the limit offset of the indexes[] array)
|
||||
i1..i3 reserved, same as the limit offset of the previous part
|
||||
i4 offset of Basic_Emoji string trie
|
||||
i5 offset of Emoji_Keycap_Sequence string trie
|
||||
i6 offset of RGI_Emoji_Modifier_Sequence string trie
|
||||
i7 offset of RGI_Emoji_Flag_Sequence string trie
|
||||
i8 offset of RGI_Emoji_Tag_Sequence string trie
|
||||
i9 offset of RGI_Emoji_ZWJ_Sequence string trie
|
||||
i10..i12 reserved, same as the limit offset of the previous part
|
||||
i13 totalSize -- same as the limit offset of the previous part
|
||||
i14..i15 reserved, 0
|
||||
|
||||
After the indexes array follows a UCPTrie=CodePointTrie (type=fast, valueWidth=8)
|
||||
"cpTrie" with one bit each for multiple binary properties;
|
||||
see EmojiProps::BIT_... constants.
|
||||
|
||||
After that follow consecutive, serialized,
|
||||
single-property UCharsTrie=CharsTrie string tries for multiple properties of strings;
|
||||
see EmojiProps::IX_.._TRIE_OFFSET constants.
|
||||
|
||||
The Basic_Emoji property contains both single code points and multi-character strings.
|
||||
Its data is in both the code point trie and in one of the string tries.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
// UDataInfo cf. udata.h
|
||||
static UDataInfo dataInfo={
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ u'E', u'm', u'o', u'j' }, // dataFormat="Emoj"
|
||||
{ 1, 0, 0, 0 }, // formatVersion
|
||||
{ 14, 0, 0, 0 } // dataVersion
|
||||
};
|
||||
|
||||
class EmojiPropsBuilder : public PropsBuilder {
|
||||
public:
|
||||
EmojiPropsBuilder(UErrorCode &errorCode);
|
||||
~EmojiPropsBuilder() override;
|
||||
|
||||
void setUnicodeVersion(const UVersionInfo version) override;
|
||||
void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode) override;
|
||||
void parseUnidataFiles(const char *unidataPath, UErrorCode &errorCode) override;
|
||||
void build(UErrorCode &errorCode) override;
|
||||
void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) override;
|
||||
|
||||
// visible for C adapter
|
||||
void parsePropsOfStringsLine(char *fields[][2], UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
void setBit(UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
|
||||
void setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask, UErrorCode &errorCode);
|
||||
void parsePropsOfStringsFile(const char *path, UErrorCode &errorCode);
|
||||
|
||||
static int32_t getTrieIndex(int32_t index) {
|
||||
U_ASSERT(TRIE_IX_START <= index);
|
||||
U_ASSERT(index < TRIE_IX_LIMIT);
|
||||
return index - TRIE_IX_START;
|
||||
}
|
||||
UCharsTrieBuilder &getTrieBuilder(int32_t index) {
|
||||
index = getTrieIndex(index);
|
||||
U_ASSERT(trieBuilders[index] != nullptr);
|
||||
return *trieBuilders[index];
|
||||
}
|
||||
UnicodeString &getTrieString(int32_t index) {
|
||||
index = getTrieIndex(index);
|
||||
return trieStrings[index];
|
||||
}
|
||||
int32_t &getNumStrings(int32_t index) {
|
||||
index = getTrieIndex(index);
|
||||
return numStrings[index];
|
||||
}
|
||||
|
||||
static constexpr int32_t TRIE_IX_START = EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET;
|
||||
static constexpr int32_t TRIE_IX_LIMIT = EmojiProps::IX_RESERVED10;
|
||||
|
||||
UMutableCPTrie *mutableCPTrie = nullptr;
|
||||
UCPTrie *cpTrie = nullptr;
|
||||
std::set<std::string> unrecognized;
|
||||
UCharsTrieBuilder *trieBuilders[TRIE_IX_LIMIT - TRIE_IX_START] = {
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr
|
||||
};
|
||||
UnicodeString trieStrings[TRIE_IX_LIMIT - TRIE_IX_START];
|
||||
int32_t numStrings[TRIE_IX_LIMIT - TRIE_IX_START];
|
||||
int32_t indexes[EmojiProps::IX_COUNT] = {
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0
|
||||
};
|
||||
uint8_t trieBlock[100000];
|
||||
int32_t trieSize = 0;
|
||||
};
|
||||
|
||||
/**
 * Opens the mutable code point trie and allocates one string trie builder
 * per property of strings.
 *
 * @param errorCode Set to a failure code if the trie cannot be opened or a
 *                  builder cannot be allocated or constructed. The object
 *                  remains safely destructible in that case.
 */
EmojiPropsBuilder::EmojiPropsBuilder(UErrorCode &errorCode) {
    // numStrings has no in-class initializer; zero it before any early return.
    for (auto &num : numStrings) {
        num = 0;
    }

    mutableCPTrie = umutablecptrie_open(0, 0, &errorCode);
    if (U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops/emoji error: umutablecptrie_open() failed: %s\n",
                u_errorName(errorCode));
        // Stop here so that the failure is not reported a second time
        // as a UCharsTrieBuilder failure.
        return;
    }

    for (auto &ptr : trieBuilders) {
        ptr = new UCharsTrieBuilder(errorCode);
        if (ptr == nullptr && U_SUCCESS(errorCode)) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    if (U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops/emoji error: new UCharsTrieBuilder() failed: %s\n",
                u_errorName(errorCode));
    }
}
|
||||
|
||||
/** Releases both code point tries and every string trie builder. */
EmojiPropsBuilder::~EmojiPropsBuilder() {
    umutablecptrie_close(mutableCPTrie);
    ucptrie_close(cpTrie);
    for (int32_t i = 0; i < UPRV_LENGTHOF(trieBuilders); ++i) {
        delete trieBuilders[i];
    }
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setUnicodeVersion(const UVersionInfo version) {
|
||||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||
}
|
||||
|
||||
namespace {

// Pairs a binary property of code points with the bit position (shift)
// used for it in the code point trie values.
struct PropToBinary {
    UProperty prop;
    int32_t shift;
};

// Binary emoji properties handled by setProps().
// setProps() asserts that each shift is less than 8.
constexpr PropToBinary propToBinaries[] = {
    { UCHAR_EMOJI,                      EmojiProps::BIT_EMOJI },
    { UCHAR_EMOJI_PRESENTATION,         EmojiProps::BIT_EMOJI_PRESENTATION },
    { UCHAR_EMOJI_MODIFIER,             EmojiProps::BIT_EMOJI_MODIFIER },
    { UCHAR_EMOJI_MODIFIER_BASE,        EmojiProps::BIT_EMOJI_MODIFIER_BASE },
    { UCHAR_EMOJI_COMPONENT,            EmojiProps::BIT_EMOJI_COMPONENT },
    { UCHAR_EXTENDED_PICTOGRAPHIC,      EmojiProps::BIT_EXTENDED_PICTOGRAPHIC },
};

// Pairs a property name, as it appears in the type field of the emoji
// sequence files, with the EmojiProps index of that property's string trie.
struct PropNameToIndex {
    const char *propName;
    int32_t emojiPropsIndex;
};

// Properties of strings recognized by parsePropsOfStringsLine().
// Lines with any other name are collected as "unrecognized".
constexpr PropNameToIndex propNameToIndex[] = {
    { "Basic_Emoji",                    EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET },
    { "Emoji_Keycap_Sequence",          EmojiProps::IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET },
    { "RGI_Emoji_Modifier_Sequence",    EmojiProps::IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET },
    { "RGI_Emoji_Flag_Sequence",        EmojiProps::IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET },
    { "RGI_Emoji_Tag_Sequence",         EmojiProps::IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET },
    { "RGI_Emoji_ZWJ_Sequence",         EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET },
};

}  // namespace
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
if (newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
|
||||
for (const auto &p2b : propToBinaries) {
|
||||
U_ASSERT(p2b.shift < 8);
|
||||
if (newValues.contains(p2b.prop)) {
|
||||
setBit(props.start, props.end, p2b.shift, props.binProps[p2b.prop], errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setBit(UChar32 start, UChar32 end, int32_t shift, bool on,
|
||||
UErrorCode &errorCode) {
|
||||
uint32_t mask = U_MASK(shift);
|
||||
uint32_t value = on ? mask : 0;
|
||||
setBits(start, end, value, mask, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
if (start == end) {
|
||||
uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
|
||||
uint32_t newValue = (oldValue & ~mask) | value;
|
||||
if (newValue != oldValue) {
|
||||
umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
|
||||
}
|
||||
return;
|
||||
}
|
||||
while (start <= end && U_SUCCESS(errorCode)) {
|
||||
uint32_t oldValue;
|
||||
UChar32 rangeEnd = umutablecptrie_getRange(
|
||||
mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
|
||||
if (rangeEnd > end) {
|
||||
rangeEnd = end;
|
||||
}
|
||||
uint32_t newValue = (oldValue & ~mask) | value;
|
||||
if (newValue != oldValue) {
|
||||
umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
|
||||
}
|
||||
start = rangeEnd + 1;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void U_CALLCONV
|
||||
parsePropsOfStringsLineFn(
|
||||
void *context,
|
||||
char *fields[][2], int32_t /* fieldCount */,
|
||||
UErrorCode *pErrorCode) {
|
||||
reinterpret_cast<EmojiPropsBuilder *>(context)->parsePropsOfStringsLine(fields, *pErrorCode);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::parseUnidataFiles(const char *unidataPath, UErrorCode &errorCode) {
|
||||
CharString path(unidataPath, errorCode);
|
||||
path.ensureEndsWithFileSeparator(errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
int32_t pathLength = path.length();
|
||||
path.append("emoji-sequences.txt", errorCode);
|
||||
parsePropsOfStringsFile(path.data(), errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
path.truncate(pathLength);
|
||||
path.append("emoji-zwj-sequences.txt", errorCode);
|
||||
parsePropsOfStringsFile(path.data(), errorCode);
|
||||
|
||||
if (U_SUCCESS(errorCode) && !unrecognized.empty()) {
|
||||
puts("\n*** genprops/emoji warning: sample of unrecognized property names:");
|
||||
int32_t i = 0;
|
||||
for (const auto &s : unrecognized) {
|
||||
printf(" \"%s\"\n", s.c_str());
|
||||
if (++i == 10) { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::parsePropsOfStringsFile(const char *path, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
char *fields[3][2];
|
||||
u_parseDelimitedFile(path, ';', fields, 3, parsePropsOfStringsLineFn, this, &errorCode);
|
||||
}
|
||||
|
||||
void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
// Format:
|
||||
// code_point(s) ; type_field ; description # comments
|
||||
*fields[1][1] = 0; // NUL-terminate the name field
|
||||
char *propName = const_cast<char *>(u_skipWhitespace(fields[1][0]));
|
||||
u_rtrim(propName);
|
||||
int32_t index = -1;
|
||||
for (const PropNameToIndex &pn2i : propNameToIndex) {
|
||||
if (strcmp(pn2i.propName, propName) == 0) {
|
||||
index = pn2i.emojiPropsIndex;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (index < 0) {
|
||||
// not a supported property
|
||||
unrecognized.insert(propName);
|
||||
return;
|
||||
}
|
||||
|
||||
const char *rangeOrString = fields[0][0];
|
||||
if (strstr(rangeOrString, "..") != nullptr) {
|
||||
// Code point range:
|
||||
// 231A..231B ; Basic_Emoji ; watch
|
||||
if (index != EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: single code points %s for %s\n", rangeOrString, propName);
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
uint32_t start, end;
|
||||
u_parseCodePointRange(rangeOrString, &start, &end, &errorCode);
|
||||
setBit(start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
} else {
|
||||
// Code point or string:
|
||||
// 23F0 ; Basic_Emoji ; alarm clock
|
||||
// 23F1 FE0F ; Basic_Emoji ; stopwatch
|
||||
uint32_t first;
|
||||
UChar s[100];
|
||||
int32_t length = u_parseString(rangeOrString, s, UPRV_LENGTHOF(s), &first, &errorCode);
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
if (length == 0) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: empty string on line\n %s ; %s ; %s\n",
|
||||
rangeOrString, propName, fields[2][0]);
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (length == U16_LENGTH(first)) {
|
||||
// single code point
|
||||
if (index != EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: single code point %s for %s\n", rangeOrString, propName);
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
setBit(first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
} else {
|
||||
// more than one code point
|
||||
UnicodeString us(false, s, length);
|
||||
getTrieBuilder(index).add(us, 0, errorCode);
|
||||
++getNumStrings(index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::build(UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
cpTrie = umutablecptrie_buildImmutable(
|
||||
mutableCPTrie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: umutablecptrie_buildImmutable() failed: %s\n",
|
||||
u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
trieSize = ucptrie_toBinary(cpTrie, trieBlock, sizeof(trieBlock), &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: ucptrie_toBinary() failed: %s (length %ld)\n",
|
||||
u_errorName(errorCode), (long)trieSize);
|
||||
return;
|
||||
}
|
||||
U_ASSERT((trieSize & 3) == 0); // multiple of 4 bytes
|
||||
|
||||
for (int32_t index = TRIE_IX_START; index < TRIE_IX_LIMIT; ++index) {
|
||||
if (getNumStrings(index) == 0) {
|
||||
fprintf(stderr, "genprops/emoji error: no strings for property index %d\n", (int)index);
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
UCharsTrieBuilder &builder = getTrieBuilder(index);
|
||||
UnicodeString &result = getTrieString(index);
|
||||
builder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, result, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: UCharsTrieBuilder[%d].buildUnicodeString() failed: %s\n",
|
||||
(int)index, u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Set indexes.
|
||||
int32_t length = sizeof(indexes);
|
||||
U_ASSERT(length == EmojiProps::IX_COUNT * 4);
|
||||
int32_t offset = length;
|
||||
indexes[EmojiProps::IX_CPTRIE_OFFSET] = offset;
|
||||
if (!beQuiet) {
|
||||
puts("* uemoji.icu stats *");
|
||||
printf("UCPTrie size in bytes: %5u\n", (int)trieSize);
|
||||
}
|
||||
offset += trieSize;
|
||||
|
||||
indexes[EmojiProps::IX_RESERVED1] = offset;
|
||||
indexes[EmojiProps::IX_RESERVED2] = offset;
|
||||
indexes[EmojiProps::IX_RESERVED3] = offset;
|
||||
|
||||
int32_t index = EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: Basic_Emoji %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
index = EmojiProps::IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: Emoji_Keycap_Sequence %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
index = EmojiProps::IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: RGI_Emoji_Modifier_Sequence %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
index = EmojiProps::IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: RGI_Emoji_Flag_Sequence %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
index = EmojiProps::IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: RGI_Emoji_Tag_Sequence %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
index = EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET;
|
||||
indexes[index] = offset;
|
||||
length = getTrieString(index).length() * 2;
|
||||
if (!beQuiet) {
|
||||
printf("UCharsTrie size in bytes: RGI_Emoji_ZWJ_Sequence %5u num strings: %5u\n",
|
||||
(int)length, (int)getNumStrings(index));
|
||||
}
|
||||
offset += length;
|
||||
|
||||
indexes[EmojiProps::IX_RESERVED10] = offset;
|
||||
indexes[EmojiProps::IX_RESERVED11] = offset;
|
||||
indexes[EmojiProps::IX_RESERVED12] = offset;
|
||||
indexes[EmojiProps::IX_TOTAL_SIZE] = offset;
|
||||
|
||||
if (!beQuiet) {
|
||||
printf("data size: %6ld\n", (long)offset);
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void writeTrieBlock(UNewDataMemory *pData, const UnicodeString &s) {
|
||||
udata_writeBlock(pData, s.getBuffer(), s.length() * 2);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
UNewDataMemory *pData = udata_create(path, "icu", "uemoji", &dataInfo,
|
||||
withCopyright ? U_COPYRIGHT_STRING : nullptr, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops/emoji error: udata_create(%s, uemoji.icu) failed: %s\n",
|
||||
path, u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, trieBlock, trieSize);
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET));
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET));
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET));
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET));
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET));
|
||||
writeTrieBlock(pData, getTrieString(EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET));
|
||||
|
||||
long dataLength = udata_finish(pData, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/emoji error: error %s writing the output file\n",
|
||||
u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t totalSize = indexes[EmojiProps::IX_TOTAL_SIZE];
|
||||
if (dataLength != (long)totalSize) {
|
||||
fprintf(stderr,
|
||||
"udata_finish(uemoji.icu) reports %ld bytes written but should be %ld\n",
|
||||
dataLength, (long)totalSize);
|
||||
errorCode = U_INTERNAL_PROGRAM_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Factory for the emoji properties builder.
 * Returns nullptr if errorCode already indicates a failure, and sets
 * U_MEMORY_ALLOCATION_ERROR if the builder cannot be allocated.
 */
PropsBuilder *
createEmojiPropsBuilder(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    PropsBuilder *builder = new EmojiPropsBuilder(errorCode);
    if (builder != nullptr) { return builder; }
    errorCode = U_MEMORY_ALLOCATION_ERROR;
    return nullptr;
}
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
|
@ -42,6 +42,7 @@ void PropsBuilder::setUnicodeVersion(const UVersionInfo) {}
|
|||
void PropsBuilder::setAlgNamesRange(UChar32, UChar32,
|
||||
const char *, const char *, UErrorCode &) {}
|
||||
void PropsBuilder::setProps(const UniProps &, const UnicodeSet &, UErrorCode &) {}
|
||||
void PropsBuilder::parseUnidataFiles(const char *, UErrorCode &) {}
|
||||
void PropsBuilder::build(UErrorCode &) {}
|
||||
void PropsBuilder::writeCSourceFile(const char *, UErrorCode &) {}
|
||||
void PropsBuilder::writeJavaSourceFile(const char *, UErrorCode &) {}
|
||||
|
@ -108,6 +109,7 @@ main(int argc, char* argv[]) {
|
|||
LocalPointer<PropsBuilder> bidiPropsBuilder(createBiDiPropsBuilder(errorCode));
|
||||
LocalPointer<PropsBuilder> casePropsBuilder(createCasePropsBuilder(errorCode));
|
||||
LocalPointer<PropsBuilder> layoutPropsBuilder(createLayoutPropsBuilder(errorCode));
|
||||
LocalPointer<PropsBuilder> emojiPropsBuilder(createEmojiPropsBuilder(errorCode));
|
||||
LocalPointer<PropsBuilder> namesPropsBuilder(createNamesPropsBuilder(errorCode));
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "genprops: unable to create PropsBuilders - %s\n", errorCode.errorName());
|
||||
|
@ -122,8 +124,10 @@ main(int argc, char* argv[]) {
|
|||
CharString icuSourceData(icuSource, errorCode);
|
||||
icuSourceData.appendPathPart("data", errorCode);
|
||||
|
||||
CharString ppucdPath(icuSourceData, errorCode);
|
||||
ppucdPath.appendPathPart("unidata", errorCode);
|
||||
CharString unidataPath(icuSourceData, errorCode);
|
||||
unidataPath.appendPathPart("unidata", errorCode);
|
||||
|
||||
CharString ppucdPath(unidataPath, errorCode);
|
||||
ppucdPath.appendPathPart("ppucd.txt", errorCode);
|
||||
|
||||
PreparsedUCD ppucd(ppucdPath.data(), errorCode);
|
||||
|
@ -151,6 +155,7 @@ main(int argc, char* argv[]) {
|
|||
bidiPropsBuilder->setProps(*props, newValues, errorCode);
|
||||
casePropsBuilder->setProps(*props, newValues, errorCode);
|
||||
layoutPropsBuilder->setProps(*props, newValues, errorCode);
|
||||
emojiPropsBuilder->setProps(*props, newValues, errorCode);
|
||||
namesPropsBuilder->setProps(*props, newValues, errorCode);
|
||||
} else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) {
|
||||
const UVersionInfo &version=ppucd.getUnicodeVersion();
|
||||
|
@ -158,6 +163,7 @@ main(int argc, char* argv[]) {
|
|||
bidiPropsBuilder->setUnicodeVersion(version);
|
||||
casePropsBuilder->setUnicodeVersion(version);
|
||||
layoutPropsBuilder->setUnicodeVersion(version);
|
||||
emojiPropsBuilder->setUnicodeVersion(version);
|
||||
namesPropsBuilder->setUnicodeVersion(version);
|
||||
} else if(lineType==PreparsedUCD::ALG_NAMES_RANGE_LINE) {
|
||||
UChar32 start, end;
|
||||
|
@ -175,10 +181,19 @@ main(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
|
||||
emojiPropsBuilder->parseUnidataFiles(unidataPath.data(), errorCode);
|
||||
|
||||
if (!beQuiet) { puts(""); }
|
||||
corePropsBuilder->build(errorCode);
|
||||
if (!beQuiet) { puts(""); }
|
||||
bidiPropsBuilder->build(errorCode);
|
||||
if (!beQuiet) { puts(""); }
|
||||
casePropsBuilder->build(errorCode);
|
||||
if (!beQuiet) { puts(""); }
|
||||
layoutPropsBuilder->build(errorCode);
|
||||
if (!beQuiet) { puts(""); }
|
||||
emojiPropsBuilder->build(errorCode);
|
||||
if (!beQuiet) { puts(""); }
|
||||
namesPropsBuilder->build(errorCode);
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "genprops error: failure finalizing the data - %s\n",
|
||||
|
@ -205,6 +220,7 @@ main(int argc, char* argv[]) {
|
|||
casePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
|
||||
namesPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
|
||||
layoutPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
|
||||
emojiPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
|
||||
|
||||
return errorCode;
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ public:
|
|||
virtual void setAlgNamesRange(UChar32 start, UChar32 end,
|
||||
const char *type, const char *prefix, UErrorCode &errorCode);
|
||||
virtual void setProps(const icu::UniProps &props, const icu::UnicodeSet &newValues, UErrorCode &errorCode);
|
||||
virtual void parseUnidataFiles(const char *unidataPath, UErrorCode &errorCode);
|
||||
virtual void build(UErrorCode &errorCode);
|
||||
virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
|
||||
virtual void writeJavaSourceFile(const char *path, UErrorCode &errorCode);
|
||||
|
@ -51,6 +52,7 @@ PropsBuilder *createCorePropsBuilder(UErrorCode &errorCode);
|
|||
PropsBuilder *createBiDiPropsBuilder(UErrorCode &errorCode);
|
||||
PropsBuilder *createCasePropsBuilder(UErrorCode &errorCode);
|
||||
PropsBuilder *createLayoutPropsBuilder(UErrorCode &errorCode);
|
||||
PropsBuilder *createEmojiPropsBuilder(UErrorCode &errorCode);
|
||||
PropsBuilder *createNamesPropsBuilder(UErrorCode &errorCode);
|
||||
|
||||
/* global flags */
|
||||
|
|
|
@ -1188,7 +1188,7 @@ static const Value VALUES_gcm[38] = {
|
|||
Value((int32_t)U_GC_ZS_MASK, "Zs Space_Separator"),
|
||||
};
|
||||
|
||||
static const Property PROPERTIES[107] = {
|
||||
static const Property PROPERTIES[114] = {
|
||||
Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
|
||||
Property(UCHAR_ASCII_HEX_DIGIT, "AHex ASCII_Hex_Digit"),
|
||||
Property(UCHAR_BIDI_CONTROL, "Bidi_C Bidi_Control"),
|
||||
|
@ -1254,6 +1254,13 @@ static const Property PROPERTIES[107] = {
|
|||
Property(UCHAR_REGIONAL_INDICATOR, "RI Regional_Indicator"),
|
||||
Property(UCHAR_PREPENDED_CONCATENATION_MARK, "PCM Prepended_Concatenation_Mark"),
|
||||
Property(UCHAR_EXTENDED_PICTOGRAPHIC, "ExtPict Extended_Pictographic"),
|
||||
Property(UCHAR_BASIC_EMOJI, "Basic_Emoji Basic_Emoji"),
|
||||
Property(UCHAR_EMOJI_KEYCAP_SEQUENCE, "Emoji_Keycap_Sequence Emoji_Keycap_Sequence"),
|
||||
Property(UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE, "RGI_Emoji_Modifier_Sequence RGI_Emoji_Modifier_Sequence"),
|
||||
Property(UCHAR_RGI_EMOJI_FLAG_SEQUENCE, "RGI_Emoji_Flag_Sequence RGI_Emoji_Flag_Sequence"),
|
||||
Property(UCHAR_RGI_EMOJI_TAG_SEQUENCE, "RGI_Emoji_Tag_Sequence RGI_Emoji_Tag_Sequence"),
|
||||
Property(UCHAR_RGI_EMOJI_ZWJ_SEQUENCE, "RGI_Emoji_ZWJ_Sequence RGI_Emoji_ZWJ_Sequence"),
|
||||
Property(UCHAR_RGI_EMOJI, "RGI_Emoji RGI_Emoji"),
|
||||
Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 23),
|
||||
Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 321),
|
||||
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 58),
|
||||
|
|
|
@ -409,6 +409,15 @@ def AddBinaryProperty(short_name, long_name):
|
|||
_properties[NormPropName(long_name)] = prop
|
||||
|
||||
|
||||
def AddSingleNameBinaryProperty(name):
  """Registers a binary property whose short and long names are the same.

  The new entry reuses elements 2 and 3 of the existing "Math" property
  entry and is stored under both the name and its normalized form.
  """
  _null_values[name] = False
  template = _properties["Math"]
  entry = ("Binary", [name] * 2, template[2], template[3])
  _properties[name] = entry
  _properties[NormPropName(name)] = entry
|
||||
|
||||
|
||||
def AddPOSIXBinaryProperty(name):
|
||||
# We only define a long name for ICU-specific (non-UCD) POSIX properties.
|
||||
_null_values[name] = False
|
||||
|
@ -521,13 +530,21 @@ def ParsePropertyAliases(in_file):
|
|||
AddBinaryProperty("nfcinert", "NFC_Inert")
|
||||
AddBinaryProperty("nfkcinert", "NFKC_Inert")
|
||||
AddBinaryProperty("segstart", "Segment_Starter")
|
||||
# http://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
# https://www.unicode.org/reports/tr51/#Emoji_Properties
|
||||
AddBinaryProperty("Emoji", "Emoji")
|
||||
AddBinaryProperty("EPres", "Emoji_Presentation")
|
||||
AddBinaryProperty("EMod", "Emoji_Modifier")
|
||||
AddBinaryProperty("EBase", "Emoji_Modifier_Base")
|
||||
AddBinaryProperty("EComp", "Emoji_Component")
|
||||
AddBinaryProperty("ExtPict", "Extended_Pictographic")
|
||||
# https://www.unicode.org/reports/tr51/#Emoji_Sets
|
||||
AddSingleNameBinaryProperty("Basic_Emoji")
|
||||
AddSingleNameBinaryProperty("Emoji_Keycap_Sequence")
|
||||
AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence")
|
||||
AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence")
|
||||
AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence")
|
||||
AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence")
|
||||
AddSingleNameBinaryProperty("RGI_Emoji")
|
||||
# C/POSIX character classes that do not have Unicode property [value] aliases.
|
||||
# See uchar.h.
|
||||
AddPOSIXBinaryProperty("alnum")
|
||||
|
@ -1609,6 +1626,8 @@ _files = {
|
|||
"DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
|
||||
"EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
|
||||
"emoji-data.txt": (DontCopy, ParseNamedProperties),
|
||||
"emoji-sequences.txt": (CopyOnly,),
|
||||
"emoji-zwj-sequences.txt": (CopyOnly,),
|
||||
"GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
|
||||
"GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
|
||||
"IdnaTestV2.txt": (CopyOnly, "testdata"),
|
||||
|
|
Loading…
Add table
Reference in a new issue