ICU-1076 edit internal source documentation

X-SVN-Rev: 5399
2025-04-08 06:53:45 +00:00 · 2001-07-31 18:22:15 +00:00 · 2001-07-31 18:22:15 +00:00 · a5e74ef787
commit a5e74ef787
parent 772c6a5bd5
3 changed files with 69 additions and 82 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -142,6 +142,71 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
 // Segments
 //----------------------------------------------------------------------

+/**
+ * Segments are parentheses-enclosed regions of the input string.
+ * These are referenced in the output string using the notation $1,
+ * $2, etc.  Numbering is in order of appearance of the left
+ * parenthesis.  Numbering is one-based.  Segments are defined as start,
+ * limit pairs.  Segments may nest.
+ *
+ * During parsing, segment data is encoded in an object of class
+ * Segments.  At runtime, the same data is encoded in compact form as
+ * an array of integers in a TransliterationRule.  The runtime encoding
+ * must satisfy three goals:
+ *
+ * 1. Iterate over the offsets in a pattern, from left to right,
+ *    and indicate all segment boundaries, in order.  This is done
+ *    during matching.
+ *
+ * 2. Given a reference $n, produce the start and limit offsets
+ *    for that segment.  This is done during replacement.
+ *
+ * 3. Similar to goal 1, but in addition, indicate whether each
+ *    segment boundary is a start or a limit, in other words, whether
+ *    each is an open paren or a close paren.  This is required by
+ *    the toRule() method.
+ *
+ * Goal 1 must be satisfied at high speed since this is done during
+ * matching.  Goal 2 is next most important.  Goal 3 is not performance
+ * critical since it is only needed by toRule().
+ *
+ * The array of integers is actually two arrays concatenated.  The
+ * first gives the index values of the open and close parentheses in
+ * the order they appear.  The second maps segment numbers to the
+ * indices of the first array.  The two arrays have the same length.
+ * Iterating over the first array satisfies goal 1.  Indexing into the
+ * second array satisfies goal 2.  Goal 3 is satisfied by iterating
+ * over the second array and constructing the required data when
+ * needed.  This is what toRule() does.
+ *
+ * Example:  (a b(c d)e f)
+ *            0 1 2 3 4 5 6
+ *
+ * First array: Indices are 0, 2, 4, and 6.
+ *
+ * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
+ * second array is 0, 3, 1, 2 -- these give the indices in the
+ * first array at which $1:open, $1:close, $2:open, and $2:close
+ * occur.
+ *
+ * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
+ *
+ * Each subarray is terminated with a -1, and two leading entries
+ * give the number of segments and the offset to the first entry
+ * of the second array.  In addition, the second array values are
+ * all offset by 2 so they index directly into the final array.
+ * The total array size is 4*segments[0] + 4.  The second index is
+ * 2*segments[0] + 3.
+ *
+ * In the output string, a segment reference is indicated by a
+ * character in a special range, as defined by
+ * RuleBasedTransliterator.Data.
+ *
+ * Most rules have no segments, in which case segments is null, and the
+ * output string need not be checked for segment reference characters.
+ *
+ * See also rbt_rule.h/cpp.
+ */
 class Segments {
    UVector offsets;
    UVector isOpenParen;
@ -222,48 +287,6 @@ UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {

 // Assume caller has already gotten a TRUE validate().
 int32_t* Segments::createArray() const {
-    /**
-     * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
-     *
-     * The segments array encodes information about parentheses-
-     * enclosed regions of the input string.  These are referenced in
-     * the output string using the notation $1, $2, etc.  Numbering is
-     * in order of appearance of the left parenthesis.  Number is
-     * one-based.  Segments are defined as start, limit pairs.
-     * Segments may nest.
-     * 
-     * In order two avoid allocating two subobjects, the segments
-     * array actually comprises two arrays.  The first is gives the
-     * index values of the open and close parentheses in the order
-     * they appear.  The second maps segment numbers to the indices of
-     * the first array.  The two arrays have the same length.
-     *
-     * Example:  (a b(c d)e f)
-     *            0 1 2 3 4 5 6
-     *
-     * First array: Indices are 0, 2, 4, and 6.
-
-     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
-     * second array is 0, 3, 1 2 -- these give the indices in the
-     * first array at which $1:open, $1:close, $2:open, and $2:close
-     * occur.
-     *
-     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
-     *
-     * Each subarray is terminated with a -1, and two leading entries
-     * give the number of segments and the offset to the first entry
-     * of the second array.  In addition, the second array value are
-     * all offset by 2 so they index directly into the final array.
-     * The total array size is 4*segments[0] + 4.  The second index is
-     * 2*segments[0] + 3.
-     *
-     * In the output string, a segment reference is indicated by a
-     * character in a special range, as defined by
-     * RuleBasedTransliterator.Data.
-     *
-     * Most rules have no segments, in which case segments is null, and the
-     * output string need not be checked for segment reference characters.
-     */
    int32_t c = count(); // number of segments
    int32_t arrayLen = 4*c + 4;
    int32_t *array = new int32_t[arrayLen];
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -25,6 +25,8 @@ static const UChar BACKSLASH  = 0x005C; // \
 // Otherwise, we allocate heap space.
 #define MAX_STATIC_SEGS 20

+// Macros for accessing the array of integers encoding the position of
+// the segments.  See rbt_pars.cpp::Segments for more details.
 #define FIRST_SEG_POS_INDEX 2
 #define SEGMENTS_COUNT segments[0]
 #define SEGMENTS_LEN (SEGMENTS_COUNT*2+4)
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -63,46 +63,8 @@ private:
    UnicodeString output;

    /**
-     * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
-     *
-     * The segments array encodes information about parentheses-
-     * enclosed regions of the input string.  These are referenced in
-     * the output string using the notation $1, $2, etc.  Numbering is
-     * in order of appearance of the left parenthesis.  Number is
-     * one-based.  Segments are defined as start, limit pairs.
-     * Segments may nest.
-     * 
-     * In order two avoid allocating two subobjects, the segments
-     * array actually comprises two arrays.  The first is gives the
-     * index values of the open and close parentheses in the order
-     * they appear.  The second maps segment numbers to the indices of
-     * the first array.  The two arrays have the same length.
-     *
-     * Example:  (a b(c d)e f)
-     *            0 1 2 3 4 5 6
-     *
-     * First array: Indices are 0, 2, 4, and 6.
-
-     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
-     * second array is 0, 3, 1 2 -- these give the indices in the
-     * first array at which $1:open, $1:close, $2:open, and $2:close
-     * occur.
-     *
-     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
-     *
-     * Each subarray is terminated with a -1, and two leading entries
-     * give the number of segments and the offset to the first entry
-     * of the second array.  In addition, the second array value are
-     * all offset by 2 so they index directly into the final array.
-     * The total array size is 4*segments[0] + 4.  The second index is
-     * 2*segments[0] + 3.
-     *
-     * In the output string, a segment reference is indicated by a
-     * character in a special range, as defined by
-     * RuleBasedTransliterator.Data.
-     *
-     * Most rules have no segments, in which case segments is null, and the
-     * output string need not be checked for segment reference characters.
+     * An array of integers encoding the position of the segments.
+     * See rbt_pars.cpp::Segments for more details.
     */
    int32_t* segments;