diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index 00aec9cac14..21cc093a84c 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -142,6 +142,71 @@ UnicodeString ParseData::parseReference(const UnicodeString& text, // Segments //---------------------------------------------------------------------- +/** + * Segments are parentheses-enclosed regions of the input string. + * These are referenced in the output string using the notation $1, + * $2, etc. Numbering is in order of appearance of the left + * parenthesis. Number is one-based. Segments are defined as start, + * limit pairs. Segments may nest. + * + * During parsing, segment data is encoded in an object of class + * Segments. At runtime, the same data is encoded in compact form as + * an array of integers in a TransliterationRule. The runtime encoding + * must satisfy three goals: + * + * 1. Iterate over the offsets in a pattern, from left to right, + * and indicate all segment boundaries, in order. This is done + * during matching. + * + * 2. Given a reference $n, produce the start and limit offsets + * for that segment. This is done during replacement. + * + * 3. Similar to goal 1, but in addition, indicate whether each + * segment boundary is a start or a limit, in other words, whether + * each is an open paren or a close paren. This is required by + * the toRule() method. + * + * Goal 1 must be satisfied at high speed since this is done during + * matching. Goal 2 is next most important. Goal 3 is not performance + * critical since it is only needed by toRule(). + * + * The array of integers is actually two arrays concatenated. The + * first gives the index values of the open and close parentheses in + * the order they appear. The second maps segment numbers to the + * indices of the first array. The two arrays have the same length. + * Iterating over the first array satisfies goal 1. Indexing into the + * second array satisfies goal 2. Goal 3 is satisfied by iterating + * over the second array and constructing the required data when + * needed. This is what toRule() does. + * + * Example: (a b(c d)e f) + * 0 1 2 3 4 5 6 + * + * First array: Indices are 0, 2, 4, and 6. + + * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the + * second array is 0, 3, 1 2 -- these give the indices in the + * first array at which $1:open, $1:close, $2:open, and $2:close + * occur. + * + * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1 + * + * Each subarray is terminated with a -1, and two leading entries + * give the number of segments and the offset to the first entry + * of the second array. In addition, the second array value are + * all offset by 2 so they index directly into the final array. + * The total array size is 4*segments[0] + 4. The second index is + * 2*segments[0] + 3. + * + * In the output string, a segment reference is indicated by a + * character in a special range, as defined by + * RuleBasedTransliterator.Data. + * + * Most rules have no segments, in which case segments is null, and the + * output string need not be checked for segment reference characters. + * + * See also rbt_rule.h/cpp. + */ class Segments { UVector offsets; UVector isOpenParen; @@ -222,48 +287,6 @@ UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) { // Assume caller has already gotten a TRUE validate(). int32_t* Segments::createArray() const { - /** - * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<< - * - * The segments array encodes information about parentheses- - * enclosed regions of the input string. These are referenced in - * the output string using the notation $1, $2, etc. Numbering is - * in order of appearance of the left parenthesis. Number is - * one-based. Segments are defined as start, limit pairs. - * Segments may nest. - * - * In order two avoid allocating two subobjects, the segments - * array actually comprises two arrays. The first is gives the - * index values of the open and close parentheses in the order - * they appear. The second maps segment numbers to the indices of - * the first array. The two arrays have the same length. - * - * Example: (a b(c d)e f) - * 0 1 2 3 4 5 6 - * - * First array: Indices are 0, 2, 4, and 6. - - * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the - * second array is 0, 3, 1 2 -- these give the indices in the - * first array at which $1:open, $1:close, $2:open, and $2:close - * occur. - * - * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1 - * - * Each subarray is terminated with a -1, and two leading entries - * give the number of segments and the offset to the first entry - * of the second array. In addition, the second array value are - * all offset by 2 so they index directly into the final array. - * The total array size is 4*segments[0] + 4. The second index is - * 2*segments[0] + 3. - * - * In the output string, a segment reference is indicated by a - * character in a special range, as defined by - * RuleBasedTransliterator.Data. - * - * Most rules have no segments, in which case segments is null, and the - * output string need not be checked for segment reference characters. - */ int32_t c = count(); // number of segments int32_t arrayLen = 4*c + 4; int32_t *array = new int32_t[arrayLen]; diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 2f627001981..100d592fbb4 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -25,6 +25,8 @@ static const UChar BACKSLASH = 0x005C; // \ // Otherwise, we allocate heap space. #define MAX_STATIC_SEGS 20 +// Macros for accessing the array of integers encoding the position of +// the segments. See rbt_pars.cpp::Segments for more details. #define FIRST_SEG_POS_INDEX 2 #define SEGMENTS_COUNT segments[0] #define SEGMENTS_LEN (SEGMENTS_COUNT*2+4) diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h index 590eb4133c8..41add19258e 100644 --- a/icu4c/source/i18n/rbt_rule.h +++ b/icu4c/source/i18n/rbt_rule.h @@ -63,46 +63,8 @@ private: UnicodeString output; /** - * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<< - * - * The segments array encodes information about parentheses- - * enclosed regions of the input string. These are referenced in - * the output string using the notation $1, $2, etc. Numbering is - * in order of appearance of the left parenthesis. Number is - * one-based. Segments are defined as start, limit pairs. - * Segments may nest. - * - * In order two avoid allocating two subobjects, the segments - * array actually comprises two arrays. The first is gives the - * index values of the open and close parentheses in the order - * they appear. The second maps segment numbers to the indices of - * the first array. The two arrays have the same length. - * - * Example: (a b(c d)e f) - * 0 1 2 3 4 5 6 - * - * First array: Indices are 0, 2, 4, and 6. - - * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the - * second array is 0, 3, 1 2 -- these give the indices in the - * first array at which $1:open, $1:close, $2:open, and $2:close - * occur. - * - * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1 - * - * Each subarray is terminated with a -1, and two leading entries - * give the number of segments and the offset to the first entry - * of the second array. In addition, the second array value are - * all offset by 2 so they index directly into the final array. - * The total array size is 4*segments[0] + 4. The second index is - * 2*segments[0] + 3. - * - * In the output string, a segment reference is indicated by a - * character in a special range, as defined by - * RuleBasedTransliterator.Data. - * - * Most rules have no segments, in which case segments is null, and the - * output string need not be checked for segment reference characters. + * An array of integers encoding the position of the segments. + * See rbt_pars.cpp::Segments for more details. */ int32_t* segments;