ICU-1076 edit internal source documentation

X-SVN-Rev: 5399
This commit is contained in:
Alan Liu 2001-07-31 18:22:15 +00:00
parent 772c6a5bd5
commit a5e74ef787
3 changed files with 69 additions and 82 deletions

View file

@ -142,6 +142,71 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
// Segments
//----------------------------------------------------------------------
/**
* Segments are parentheses-enclosed regions of the input string.
* These are referenced in the output string using the notation $1,
* $2, etc. Numbering is in order of appearance of the left
* parenthesis. Number is one-based. Segments are defined as start,
* limit pairs. Segments may nest.
*
* During parsing, segment data is encoded in an object of class
* Segments. At runtime, the same data is encoded in compact form as
* an array of integers in a TransliterationRule. The runtime encoding
* must satisfy three goals:
*
* 1. Iterate over the offsets in a pattern, from left to right,
* and indicate all segment boundaries, in order. This is done
* during matching.
*
* 2. Given a reference $n, produce the start and limit offsets
* for that segment. This is done during replacement.
*
* 3. Similar to goal 1, but in addition, indicate whether each
* segment boundary is a start or a limit, in other words, whether
* each is an open paren or a close paren. This is required by
* the toRule() method.
*
* Goal 1 must be satisfied at high speed since this is done during
* matching. Goal 2 is next most important. Goal 3 is not performance
* critical since it is only needed by toRule().
*
* The array of integers is actually two arrays concatenated. The
* first gives the index values of the open and close parentheses in
* the order they appear. The second maps segment numbers to the
* indices of the first array. The two arrays have the same length.
* Iterating over the first array satisfies goal 1. Indexing into the
* second array satisfies goal 2. Goal 3 is satisfied by iterating
* over the second array and constructing the required data when
* needed. This is what toRule() does.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array value are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*
* See also rbt_rule.h/cpp.
*/
class Segments {
UVector offsets;
UVector isOpenParen;
@ -222,48 +287,6 @@ UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
// Assume caller has already gotten a TRUE validate().
int32_t* Segments::createArray() const {
/**
* >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
*
* The segments array encodes information about parentheses-
* enclosed regions of the input string. These are referenced in
* the output string using the notation $1, $2, etc. Numbering is
* in order of appearance of the left parenthesis. Number is
* one-based. Segments are defined as start, limit pairs.
* Segments may nest.
*
* In order two avoid allocating two subobjects, the segments
* array actually comprises two arrays. The first is gives the
* index values of the open and close parentheses in the order
* they appear. The second maps segment numbers to the indices of
* the first array. The two arrays have the same length.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array value are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*/
int32_t c = count(); // number of segments
int32_t arrayLen = 4*c + 4;
int32_t *array = new int32_t[arrayLen];

View file

@ -25,6 +25,8 @@ static const UChar BACKSLASH = 0x005C; // \
// Otherwise, we allocate heap space.
#define MAX_STATIC_SEGS 20
// Macros for accessing the array of integers encoding the position of
// the segments. See rbt_pars.cpp::Segments for more details.
#define FIRST_SEG_POS_INDEX 2
#define SEGMENTS_COUNT segments[0]
#define SEGMENTS_LEN (SEGMENTS_COUNT*2+4)

View file

@ -63,46 +63,8 @@ private:
UnicodeString output;
/**
* >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
*
* The segments array encodes information about parentheses-
* enclosed regions of the input string. These are referenced in
* the output string using the notation $1, $2, etc. Numbering is
* in order of appearance of the left parenthesis. Number is
* one-based. Segments are defined as start, limit pairs.
* Segments may nest.
*
* In order two avoid allocating two subobjects, the segments
* array actually comprises two arrays. The first is gives the
* index values of the open and close parentheses in the order
* they appear. The second maps segment numbers to the indices of
* the first array. The two arrays have the same length.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array value are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
* An array of integers encoding the position of the segments.
* See rbt_pars.cpp::Segments for more details.
*/
int32_t* segments;