mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-03 20:45:30 +00:00
ICU-22503 add property Indic_Conjunct_Break
This commit is contained in:
parent
2cbfd134ef
commit
06c077bd35
22 changed files with 3406 additions and 3287 deletions
|
@ -6,4 +6,4 @@
|
|||
# for running Bazel commands while ensuring, through configuration, that only a
|
||||
# specific version of Bazel is executed.
|
||||
|
||||
USE_BAZEL_VERSION=7.1.1
|
||||
USE_BAZEL_VERSION=7.2.1
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -677,13 +677,19 @@ typedef enum UProperty {
|
|||
* @draft ICU 75
|
||||
*/
|
||||
UCHAR_IDENTIFIER_STATUS=0x1019,
|
||||
/**
|
||||
* Enumerated property Indic_Conjunct_Break.
|
||||
* Used in the grapheme cluster break algorithm in UAX #29.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
UCHAR_INDIC_CONJUNCT_BREAK=0x101A,
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
/**
|
||||
* One more than the last constant for enumerated/integer Unicode properties.
|
||||
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
|
||||
*/
|
||||
UCHAR_INT_LIMIT=0x101A,
|
||||
UCHAR_INT_LIMIT=0x101B,
|
||||
#endif // U_HIDE_DEPRECATED_API
|
||||
|
||||
/** Bitmask property General_Category_Mask.
|
||||
|
@ -2729,6 +2735,31 @@ typedef enum UIndicSyllabicCategory {
|
|||
U_INSC_REORDERING_KILLER,
|
||||
} UIndicSyllabicCategory;
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Indic Conjunct Break constants. See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
|
||||
*
|
||||
* @see UCHAR_INDIC_CONJUNCT_BREAK
|
||||
* @draft ICU 76
|
||||
*/
|
||||
typedef enum UIndicConjunctBreak {
|
||||
/*
|
||||
* Note: UIndicConjunctBreak constants are parsed by preparseucd.py.
|
||||
* It matches lines like
|
||||
* U_INCB_<Unicode Indic_Conjunct_Break value name>
|
||||
*/
|
||||
|
||||
/** @draft ICU 76 */
|
||||
U_INCB_NONE,
|
||||
/** @draft ICU 76 */
|
||||
U_INCB_CONSONANT,
|
||||
/** @draft ICU 76 */
|
||||
U_INCB_EXTEND,
|
||||
/** @draft ICU 76 */
|
||||
U_INCB_LINKER,
|
||||
} UIndicConjunctBreak;
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Vertical Orientation constants.
|
||||
*
|
||||
|
|
|
@ -728,6 +728,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
|
|||
{ UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
|
||||
{ UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
|
||||
{ UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },
|
||||
{ 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },
|
||||
};
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
|
|
@ -127,7 +127,8 @@ namespace {
|
|||
// Bits
|
||||
// 31..26 Age major version (major=0..63)
|
||||
// 25..24 Age minor version (minor=0..3)
|
||||
// 23..15 reserved
|
||||
// 23..17 reserved
|
||||
// 16..15 Indic Conjunct Break
|
||||
// 14..12 East Asian Width
|
||||
// 11..10 3..1: Bits 9..0 = Script_Extensions index
|
||||
// 3: Script value from Script_Extensions
|
||||
|
@ -158,6 +159,9 @@ inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
|
|||
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
|
||||
inline constexpr int32_t UPROPS_EA_SHIFT = 12;
|
||||
|
||||
inline constexpr uint32_t UPROPS_INCB_MASK = 0x00018000;
|
||||
inline constexpr int32_t UPROPS_INCB_SHIFT = 15;
|
||||
|
||||
/** Script_Extensions: mask includes Script */
|
||||
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;
|
||||
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -103,6 +103,9 @@ export UNICODE_TOOLS=~/oss/unicodetools/mine/src
|
|||
so that the makefiles see the new version number.
|
||||
cd $ICU_OUT/icu4c
|
||||
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ../../src/icu4c/source/runConfigureICU --enable-debug --disable-release Linux/clang --prefix=/usr/local/google/home/mscherer/icu/mine/inst/icu4c > config.out 2>&1 ; tail config.out
|
||||
+ Elango's version (different default C++ compiler & in-source build paths):
|
||||
cd $ICU_OUT/icu4c/source
|
||||
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ./runConfigureICU --enable-debug --disable-release Linux/gcc --prefix=/usr/local/google/home/elango/oss/icu/icu4c > config.out 2>&1 ; tail config.out
|
||||
|
||||
*** data files & enums & parser code
|
||||
|
||||
|
@ -360,8 +363,6 @@ copying that version number into the $ICU_SRC/.bazeliskrc config file.
|
|||
|
||||
* run & fix ICU4J tests
|
||||
|
||||
TODO
|
||||
|
||||
*** API additions
|
||||
- send notice to icu-design about new born-@stable API (enum constants etc.)
|
||||
|
||||
|
|
|
@ -2802,6 +2802,14 @@ TestAdditionalProperties(void) {
|
|||
{ 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, false },
|
||||
{ 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, true },
|
||||
|
||||
/* Indic_Conjunct_Break values */
|
||||
{ 0x094D, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
|
||||
{ 0x09B9, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
|
||||
{ 0x05BE, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
|
||||
{ 0x05BF, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
|
||||
{ 0x05C0, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
|
||||
{ 0xD800, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
|
||||
|
||||
/* undefined UProperty values */
|
||||
{ 0x61, 0x4a7, 0 },
|
||||
{ 0x234bc, 0x15ed, 0 }
|
||||
|
|
|
@ -21,6 +21,9 @@
|
|||
#include "testutil.h"
|
||||
#include "uparse.h"
|
||||
#include "ucdtest.h"
|
||||
#include "usettest.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
static const char *ignorePropNames[]={
|
||||
"FC_NFKC",
|
||||
|
@ -1092,6 +1095,10 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
|
|||
{ UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
|
||||
{ UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
|
||||
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
|
||||
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
|
||||
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
|
||||
};
|
||||
|
||||
// Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
|
||||
|
@ -1133,7 +1140,7 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
|
|||
if (!tp.isBinary()) {
|
||||
msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
|
||||
}
|
||||
assertTrue(msg.c_str(), tp.set == icuPropSet);
|
||||
UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2114,20 +2114,26 @@ void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool
|
|||
}
|
||||
|
||||
UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
|
||||
assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
|
||||
assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
|
||||
return checkEqual(*this, s, t, message);
|
||||
}
|
||||
|
||||
UBool UnicodeSetTest::checkEqual(
|
||||
IntlTest& intlTest,
|
||||
const UnicodeSet& s, const UnicodeSet& t, const char* message) {
|
||||
intlTest.assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
|
||||
intlTest.assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
|
||||
UnicodeString source; s.toPattern(source, true);
|
||||
UnicodeString result; t.toPattern(result, true);
|
||||
if (s != t) {
|
||||
errln(UnicodeString("FAIL: ") + message
|
||||
+ "; source = " + source
|
||||
+ "; result = " + result
|
||||
intlTest.errln((UnicodeString)"FAIL: " + message
|
||||
+ "\nsource = " + source
|
||||
+ "\nresult = " + result
|
||||
);
|
||||
return false;
|
||||
} else {
|
||||
logln(UnicodeString("Ok: ") + message
|
||||
+ "; source = " + source
|
||||
+ "; result = " + result
|
||||
intlTest.logln((UnicodeString)"Ok: " + message
|
||||
+ "\nsource = " + source
|
||||
+ "\nresult = " + result
|
||||
);
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -33,6 +33,8 @@ public:
|
|||
UnicodeSetTest();
|
||||
~UnicodeSetTest();
|
||||
|
||||
static UBool checkEqual(IntlTest& intlTest, const UnicodeSet& s, const UnicodeSet& t, const char* message);
|
||||
|
||||
private:
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=nullptr) override;
|
||||
|
||||
|
|
|
@ -904,6 +904,7 @@ public final class UCharacterProperty
|
|||
return IdentifierStatus.ALLOWED.ordinal();
|
||||
}
|
||||
},
|
||||
new IntProperty(0, INCB_MASK, INCB_SHIFT), // INDIC_CONJUNCT_BREAK
|
||||
};
|
||||
|
||||
public int getIntPropertyValue(int c, int which) {
|
||||
|
@ -1378,7 +1379,8 @@ public final class UCharacterProperty
|
|||
// Bits
|
||||
// 31..26 Age major version (major=0..63)
|
||||
// 25..24 Age minor version (minor=0..3)
|
||||
// 23..15 reserved
|
||||
// 23..17 reserved
|
||||
// 16..15 Indic Conjunct Break
|
||||
// 14..12 East Asian Width
|
||||
// 11..10 3..1: Bits 9..0 = Script_Extensions index
|
||||
// 3: Script value from Script_Extensions
|
||||
|
@ -1390,6 +1392,9 @@ public final class UCharacterProperty
|
|||
private static final int EAST_ASIAN_MASK_ = 0x00007000;
|
||||
private static final int EAST_ASIAN_SHIFT_ = 12;
|
||||
|
||||
private static final int INCB_MASK = 0x00018000;
|
||||
private static final int INCB_SHIFT = 15;
|
||||
|
||||
/** Script_Extensions: mask includes Script */
|
||||
public static final int SCRIPT_X_MASK = 0x00000fff;
|
||||
|
||||
|
|
|
@ -4124,6 +4124,24 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
|||
public static final int REORDERING_KILLER = 36;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indic Conjunct Break constants.
|
||||
* See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
|
||||
*
|
||||
* @see UProperty#INDIC_CONJUNCT_BREAK
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public enum IndicConjunctBreak {
|
||||
/** @draft ICU 76 */
|
||||
NONE,
|
||||
/** @draft ICU 76 */
|
||||
CONSONANT,
|
||||
/** @draft ICU 76 */
|
||||
EXTEND,
|
||||
/** @draft ICU 76 */
|
||||
LINKER,
|
||||
}
|
||||
|
||||
/**
|
||||
* Vertical Orientation constants.
|
||||
*
|
||||
|
|
|
@ -859,12 +859,19 @@ public interface UProperty
|
|||
*/
|
||||
public static final int IDENTIFIER_STATUS = 0x1019;
|
||||
|
||||
/**
|
||||
* Enumerated property Indic_Conjunct_Break.
|
||||
* Used in the grapheme cluster break algorithm in UAX #29.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public static final int INDIC_CONJUNCT_BREAK = 0x101A;
|
||||
|
||||
/**
|
||||
* One more than the last constant for enumerated/integer Unicode properties.
|
||||
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final int INT_LIMIT = 0x101A;
|
||||
public static final int INT_LIMIT = 0x101B;
|
||||
|
||||
/**
|
||||
* Bitmask property General_Category_Mask.
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -2212,6 +2212,14 @@ public final class UCharacterTest extends CoreTestFmwk
|
|||
{ 0x0606, UProperty.PREPENDED_CONCATENATION_MARK, FALSE },
|
||||
{ 0x110BD, UProperty.PREPENDED_CONCATENATION_MARK, TRUE },
|
||||
|
||||
/* Indic_Conjunct_Break values */
|
||||
{ 0x094D, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.LINKER.ordinal() },
|
||||
{ 0x09B9, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.CONSONANT.ordinal() },
|
||||
{ 0x05BE, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
|
||||
{ 0x05BF, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.EXTEND.ordinal() },
|
||||
{ 0x05C0, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
|
||||
{ 0xD800, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
|
||||
|
||||
/* undefined UProperty values */
|
||||
{ 0x61, 0x4a7, 0 },
|
||||
{ 0x234bc, 0x15ed, 0 }
|
||||
|
|
|
@ -312,6 +312,10 @@ although the trie can hold 16-bit values.
|
|||
|
||||
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
|
||||
|
||||
Used 2 bits from props vector 0 to add Indic_Conjunct_Break. The bits used were freed up
|
||||
by the preceding move of the Block property out of props vector 0 and the bit shuffling
|
||||
("defragmentation") of Script and Script_Extensions.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -712,6 +716,7 @@ struct PropToEnum {
|
|||
const PropToEnum
|
||||
propToEnums[]={
|
||||
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
|
||||
{ UCHAR_INDIC_CONJUNCT_BREAK, 0, UPROPS_INCB_SHIFT, UPROPS_INCB_MASK },
|
||||
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
|
||||
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
|
||||
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
|
||||
|
|
|
@ -1186,6 +1186,13 @@ static const Value VALUES_ID_Status[2] = {
|
|||
Value(U_ID_STATUS_ALLOWED, "Allowed Allowed"),
|
||||
};
|
||||
|
||||
static const Value VALUES_InCB[4] = {
|
||||
Value(U_INCB_NONE, "None None"),
|
||||
Value(U_INCB_CONSONANT, "Consonant Consonant"),
|
||||
Value(U_INCB_EXTEND, "Extend Extend"),
|
||||
Value(U_INCB_LINKER, "Linker Linker"),
|
||||
};
|
||||
|
||||
static const Value VALUES_gcm[38] = {
|
||||
Value((int32_t)U_GC_C_MASK, "C Other"),
|
||||
Value((int32_t)U_GC_CC_MASK, "Cc Control cntrl"),
|
||||
|
@ -1242,7 +1249,7 @@ static const Value VALUES_ID_Type[12] = {
|
|||
Value(U_ID_TYPE_RECOMMENDED, "Recommended Recommended"),
|
||||
};
|
||||
|
||||
static const Property PROPERTIES[119] = {
|
||||
static const Property PROPERTIES[120] = {
|
||||
Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
|
||||
Property(UCHAR_ASCII_HEX_DIGIT, "AHex ASCII_Hex_Digit"),
|
||||
Property(UCHAR_BIDI_CONTROL, "Bidi_C Bidi_Control"),
|
||||
|
@ -1344,6 +1351,7 @@ static const Property PROPERTIES[119] = {
|
|||
Property(UCHAR_INDIC_SYLLABIC_CATEGORY, "InSC Indic_Syllabic_Category", VALUES_InSC, 37),
|
||||
Property(UCHAR_VERTICAL_ORIENTATION, "vo Vertical_Orientation", VALUES_vo, 4),
|
||||
Property(UCHAR_IDENTIFIER_STATUS, "ID_Status Identifier_Status", VALUES_ID_Status, 2),
|
||||
Property(UCHAR_INDIC_CONJUNCT_BREAK, "InCB Indic_Conjunct_Break", VALUES_InCB, 4),
|
||||
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
|
||||
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
|
||||
Property(UCHAR_AGE, "age Age"),
|
||||
|
|
|
@ -2012,7 +2012,7 @@ _ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")
|
|||
# Sample line to match:
|
||||
# U_EA_AMBIGUOUS,
|
||||
_prop_and_value_re = re.compile(
|
||||
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
|
||||
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INCB|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
|
||||
|
||||
# Sample line to match if it has matched _prop_and_value_re
|
||||
# (we want to exclude aliases):
|
||||
|
|
Loading…
Add table
Reference in a new issue