ICU-22503 add property Indic_Conjunct_Break

This commit is contained in:
Elango Cheran 2024-06-25 17:51:04 -07:00
parent 2cbfd134ef
commit 06c077bd35
22 changed files with 3406 additions and 3287 deletions

View file

@ -6,4 +6,4 @@
# for running Bazel commands while ensuring, through configuration, that only a
# specific version of Bazel is executed.
USE_BAZEL_VERSION=7.1.1
USE_BAZEL_VERSION=7.2.1

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -677,13 +677,19 @@ typedef enum UProperty {
* @draft ICU 75
*/
UCHAR_IDENTIFIER_STATUS=0x1019,
/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* Its values are UIndicConjunctBreak constants.
* @see UIndicConjunctBreak
* @draft ICU 76
*/
UCHAR_INDIC_CONJUNCT_BREAK=0x101A,
#endif // U_HIDE_DRAFT_API
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UCHAR_INT_LIMIT=0x101A,
UCHAR_INT_LIMIT=0x101B,
#endif // U_HIDE_DEPRECATED_API
/** Bitmask property General_Category_Mask.
@ -2729,6 +2735,31 @@ typedef enum UIndicSyllabicCategory {
U_INSC_REORDERING_KILLER,
} UIndicSyllabicCategory;
#ifndef U_HIDE_DRAFT_API
/**
* Indic Conjunct Break constants.
* These are the values of the enumerated property Indic_Conjunct_Break,
* which is used in the grapheme cluster break algorithm in UAX #29.
* See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
*
* @see UCHAR_INDIC_CONJUNCT_BREAK
* @draft ICU 76
*/
typedef enum UIndicConjunctBreak {
/*
* Note: UIndicConjunctBreak constants are parsed by preparseucd.py.
* It matches lines like
* U_INCB_<Unicode Indic_Conjunct_Break value name>
*
* The constants have no explicit initializers, so they are numbered
* from 0 in declaration order.
*/
/**
* Indic_Conjunct_Break=None
* @draft ICU 76
*/
U_INCB_NONE,
/**
* Indic_Conjunct_Break=Consonant, for example U+09B9
* @draft ICU 76
*/
U_INCB_CONSONANT,
/**
* Indic_Conjunct_Break=Extend, for example U+05BF
* @draft ICU 76
*/
U_INCB_EXTEND,
/**
* Indic_Conjunct_Break=Linker, for example U+094D
* @draft ICU 76
*/
U_INCB_LINKER,
} UIndicConjunctBreak;
#endif // U_HIDE_DRAFT_API
/**
* Vertical Orientation constants.
*

View file

@ -728,6 +728,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
{ UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
{ UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
{ UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },
{ 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },
};
U_CAPI int32_t U_EXPORT2

View file

@ -127,7 +127,8 @@ namespace {
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
@ -158,6 +159,9 @@ inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
inline constexpr int32_t UPROPS_EA_SHIFT = 12;
/** Indic_Conjunct_Break: 2-bit field in bits 16..15 */
inline constexpr uint32_t UPROPS_INCB_MASK = 0x00018000;
inline constexpr int32_t UPROPS_INCB_SHIFT = 15;
/** Script_Extensions: mask includes Script */
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;

Binary file not shown.

Binary file not shown.

View file

@ -103,6 +103,9 @@ export UNICODE_TOOLS=~/oss/unicodetools/mine/src
so that the makefiles see the new version number.
cd $ICU_OUT/icu4c
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ../../src/icu4c/source/runConfigureICU --enable-debug --disable-release Linux/clang --prefix=/usr/local/google/home/mscherer/icu/mine/inst/icu4c > config.out 2>&1 ; tail config.out
+ Elango's version (different default C++ compiler & in-source build paths):
cd $ICU_OUT/icu4c/source
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ./runConfigureICU --enable-debug --disable-release Linux/gcc --prefix=/usr/local/google/home/elango/oss/icu/icu4c > config.out 2>&1 ; tail config.out
*** data files & enums & parser code
@ -360,8 +363,6 @@ copying that version number into the $ICU_SRC/.bazeliskrc config file.
* run & fix ICU4J tests
TODO
*** API additions
- send notice to icu-design about new born-@stable API (enum constants etc.)

View file

@ -2802,6 +2802,14 @@ TestAdditionalProperties(void) {
{ 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, false },
{ 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, true },
/* Indic_Conjunct_Break values */
{ 0x094D, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
{ 0x09B9, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ 0x05BE, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0x05BF, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ 0x05C0, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0xD800, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }

View file

@ -21,6 +21,9 @@
#include "testutil.h"
#include "uparse.h"
#include "ucdtest.h"
#include "usettest.h"
#include <iostream>
static const char *ignorePropNames[]={
"FC_NFKC",
@ -1092,6 +1095,10 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
{ UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
{ UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
#endif // !UCONFIG_NO_NORMALIZATION
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
};
// Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
@ -1133,7 +1140,7 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
if (!tp.isBinary()) {
msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
}
assertTrue(msg.c_str(), tp.set == icuPropSet);
UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
}
}

View file

@ -2114,20 +2114,26 @@ void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool
}
UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
return checkEqual(*this, s, t, message);
}
UBool UnicodeSetTest::checkEqual(
IntlTest& intlTest,
const UnicodeSet& s, const UnicodeSet& t, const char* message) {
intlTest.assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
intlTest.assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
UnicodeString source; s.toPattern(source, true);
UnicodeString result; t.toPattern(result, true);
if (s != t) {
errln(UnicodeString("FAIL: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.errln((UnicodeString)"FAIL: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
return false;
} else {
logln(UnicodeString("Ok: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.logln((UnicodeString)"Ok: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
}
return true;

View file

@ -33,6 +33,8 @@ public:
UnicodeSetTest();
~UnicodeSetTest();
static UBool checkEqual(IntlTest& intlTest, const UnicodeSet& s, const UnicodeSet& t, const char* message);
private:
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=nullptr) override;

View file

@ -904,6 +904,7 @@ public final class UCharacterProperty
return IdentifierStatus.ALLOWED.ordinal();
}
},
new IntProperty(0, INCB_MASK, INCB_SHIFT), // INDIC_CONJUNCT_BREAK
};
public int getIntPropertyValue(int c, int which) {
@ -1378,7 +1379,8 @@ public final class UCharacterProperty
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
@ -1390,6 +1392,9 @@ public final class UCharacterProperty
private static final int EAST_ASIAN_MASK_ = 0x00007000;
private static final int EAST_ASIAN_SHIFT_ = 12;
/** Indic_Conjunct_Break: 2-bit field in bits 16..15 */
private static final int INCB_MASK = 0x00018000;
private static final int INCB_SHIFT = 15;
/** Script_Extensions: mask includes Script */
public static final int SCRIPT_X_MASK = 0x00000fff;

View file

@ -4124,6 +4124,24 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static final int REORDERING_KILLER = 36;
}
/**
* Indic Conjunct Break constants.
* These are the values of the enumerated property Indic_Conjunct_Break,
* which is used in the grapheme cluster break algorithm in UAX #29.
* See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
*
* <p>The int value of the {@link UProperty#INDIC_CONJUNCT_BREAK} property
* is compared with the {@code ordinal()} of these constants,
* so the declaration order is significant.
*
* @see UProperty#INDIC_CONJUNCT_BREAK
* @draft ICU 76
*/
public enum IndicConjunctBreak {
/**
* Indic_Conjunct_Break=None
* @draft ICU 76
*/
NONE,
/**
* Indic_Conjunct_Break=Consonant, for example U+09B9
* @draft ICU 76
*/
CONSONANT,
/**
* Indic_Conjunct_Break=Extend, for example U+05BF
* @draft ICU 76
*/
EXTEND,
/**
* Indic_Conjunct_Break=Linker, for example U+094D
* @draft ICU 76
*/
LINKER,
}
/**
* Vertical Orientation constants.
*

View file

@ -859,12 +859,19 @@ public interface UProperty
*/
public static final int IDENTIFIER_STATUS = 0x1019;
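/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* Its values correspond to the UCharacter.IndicConjunctBreak constants.
* @see UCharacter.IndicConjunctBreak
* @draft ICU 76
*/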
public static final int INDIC_CONJUNCT_BREAK = 0x101A;
/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
@Deprecated
public static final int INT_LIMIT = 0x101A;
public static final int INT_LIMIT = 0x101B;
/**
* Bitmask property General_Category_Mask.

View file

@ -2212,6 +2212,14 @@ public final class UCharacterTest extends CoreTestFmwk
{ 0x0606, UProperty.PREPENDED_CONCATENATION_MARK, FALSE },
{ 0x110BD, UProperty.PREPENDED_CONCATENATION_MARK, TRUE },
/* Indic_Conjunct_Break values */
{ 0x094D, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.LINKER.ordinal() },
{ 0x09B9, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.CONSONANT.ordinal() },
{ 0x05BE, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0x05BF, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.EXTEND.ordinal() },
{ 0x05C0, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0xD800, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }

View file

@ -312,6 +312,10 @@ although the trie can hold 16-bit values.
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
Used 2 bits from props vector 0 to add Indic_Conjunct_Break. The bits used were freed up
by the preceding move of the Block property out of props vector 0 and the bit shuffling
("defragmentation") of Script and Script_Extensions.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
@ -712,6 +716,7 @@ struct PropToEnum {
const PropToEnum
propToEnums[]={
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_INDIC_CONJUNCT_BREAK, 0, UPROPS_INCB_SHIFT, UPROPS_INCB_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },

View file

@ -1186,6 +1186,13 @@ static const Value VALUES_ID_Status[2] = {
Value(U_ID_STATUS_ALLOWED, "Allowed Allowed"),
};
// Value names for the Indic_Conjunct_Break (InCB) property.
// Each entry pairs a U_INCB_* constant with its space-separated names
// (for these values the two names are identical).
// NOTE(review): this table looks machine-generated; confirm that manual
// comments survive regeneration before relying on them.
static const Value VALUES_InCB[4] = {
Value(U_INCB_NONE, "None None"),
Value(U_INCB_CONSONANT, "Consonant Consonant"),
Value(U_INCB_EXTEND, "Extend Extend"),
Value(U_INCB_LINKER, "Linker Linker"),
};
static const Value VALUES_gcm[38] = {
Value((int32_t)U_GC_C_MASK, "C Other"),
Value((int32_t)U_GC_CC_MASK, "Cc Control cntrl"),
@ -1242,7 +1249,7 @@ static const Value VALUES_ID_Type[12] = {
Value(U_ID_TYPE_RECOMMENDED, "Recommended Recommended"),
};
static const Property PROPERTIES[119] = {
static const Property PROPERTIES[120] = {
Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
Property(UCHAR_ASCII_HEX_DIGIT, "AHex ASCII_Hex_Digit"),
Property(UCHAR_BIDI_CONTROL, "Bidi_C Bidi_Control"),
@ -1344,6 +1351,7 @@ static const Property PROPERTIES[119] = {
Property(UCHAR_INDIC_SYLLABIC_CATEGORY, "InSC Indic_Syllabic_Category", VALUES_InSC, 37),
Property(UCHAR_VERTICAL_ORIENTATION, "vo Vertical_Orientation", VALUES_vo, 4),
Property(UCHAR_IDENTIFIER_STATUS, "ID_Status Identifier_Status", VALUES_ID_Status, 2),
Property(UCHAR_INDIC_CONJUNCT_BREAK, "InCB Indic_Conjunct_Break", VALUES_InCB, 4),
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
Property(UCHAR_AGE, "age Age"),

View file

@ -2012,7 +2012,7 @@ _ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")
# Sample line to match:
# U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INCB|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):