ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10132
This commit is contained in:
Andy Heninger 2002-10-31 01:58:01 +00:00
parent 0ad551db67
commit ccd8fc3536
8 changed files with 267 additions and 50 deletions

View file

@ -72,10 +72,27 @@ static UnicodeSet *gUnescapeCharSet;
// will handle.
//
static const UChar gUnescapeCharPattern[] = {
// [ a b c e f n r t u U ]
0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d};
// [ a c e f n r t u U ]
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
//----------------------------------------------------------------------------------------
//
// Unicode Set Definitions for Regular Expression composite properties
//
//----------------------------------------------------------------------------------------
// UnicodeSet pattern for the "word character" composite property, spelled out
// as UTF-16 code units and terminated by a 0:
//     [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}]
// i.e. lowercase, uppercase, titlecase and other letters, plus decimal digits.
// The set built from this pattern is stored at gPropSets[URX_ISWORD_SET].
static const UChar gIsWordPattern[] = {
// [ \ p { L l } \ p { L u }
0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
// \ p { L t } \ p { L o }
0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
// \ p { N d } ] (followed by the terminating 0)
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};
// Shared (static) sets for the composite properties, indexed by the URX_*_SET
// constants and sized by URX_LAST_SET. The RegexCompile constructor creates the
// URX_ISWORD_SET entry, and compile() hands the array to the RegexPattern
// through fStaticSets so the match engine can reach it.
// NOTE(review): only the URX_ISWORD_SET slot is filled in by the code visible
// here; the other slots appear to stay NULL - confirm before indexing them.
static const UnicodeSet *gPropSets[URX_LAST_SET];
//----------------------------------------------------------------------------------------
//
// Constructor.
@ -101,7 +118,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
}
//
// Set up the constant Unicode Sets.
// Set up the constant (static) Unicode Sets.
//
if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
// TODO: Make thread safe.
@ -110,6 +127,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
gRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
if (U_FAILURE(status)) {
delete gRuleSets[kRuleSet_rule_char-128];
delete gRuleSets[kRuleSet_white_space-128];
@ -119,6 +137,11 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
gRuleSets[kRuleSet_white_space-128] = NULL;
gRuleSets[kRuleSet_digit_char-128] = NULL;
gUnescapeCharSet = NULL;
int i;
for (i=0; i<URX_LAST_SET; i++) {
delete gPropSets[i];
gPropSets[i] = NULL;
}
return;
}
}
@ -164,6 +187,7 @@ void RegexCompile::compile(
// Prepare the RegexPattern object to receive the compiled pattern.
fRXPat->fPattern = pat;
fRXPat->fStaticSets = gPropSets;
// Initialize the pattern scanning state machine
@ -685,16 +709,26 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
break;
case doBackslashD:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
break;
case doBackslashd:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
break;
case doBackslashG:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
break;
case doBackslashW:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 1), *fStatus);
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
break;
case doBackslashw:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 0), *fStatus);
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
break;
case doBackslashX:
@ -772,7 +806,6 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
if (reserveLoc) {
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
int32_t prevType = URX_TYPE(opAtTheLoc);
U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
int32_t nop = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
}

View file

@ -47,6 +47,7 @@ enum Regex_PatternParseAction {
doBackslashB,
doNGPlus,
doPatFinish,
doBackslashD,
doIntervalMinValue,
doIntervalDigit,
doPossesiveOpt,
@ -66,6 +67,7 @@ enum Regex_PatternParseAction {
doPatStart,
doBackslashb,
doEndString,
doBackslashd,
doOpenLookBehindNeg,
doSplitString,
rbbiLastAction};
@ -96,7 +98,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
, {doNOP, 253, 2,0, FALSE} // 9
, {doRuleError, 255, 71,0, FALSE} // 10
, {doRuleError, 255, 73,0, FALSE} // 10
, {doStringChar, 254, 11,0, TRUE} // 11 string
, {doStringChar, 130, 11,0, TRUE} // 12
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
@ -118,10 +120,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
, {doBadOpenParenType, 255, 71,0, FALSE} // 32
, {doBadOpenParenType, 255, 73,0, FALSE} // 32
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
, {doBadOpenParenType, 255, 71,0, FALSE} // 35
, {doBadOpenParenType, 255, 73,0, FALSE} // 35
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
, {doStar, 255, 22,0, FALSE} // 38
@ -133,14 +135,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpt, 255, 22,0, FALSE} // 44
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
, {doNumberExpectedError, 255, 71,0, FALSE} // 47
, {doNumberExpectedError, 255, 73,0, FALSE} // 47
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
, {doIntervalDigit, 128, 48,0, TRUE} // 50
, {doNumberExpectedError, 255, 71,0, FALSE} // 51
, {doNumberExpectedError, 255, 73,0, FALSE} // 51
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
, {doNumberExpectedError, 255, 71,0, FALSE} // 54
, {doNumberExpectedError, 255, 73,0, FALSE} // 54
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
@ -148,16 +150,18 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 60
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 61
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 62
, {doProperty, 112 /* p */, 18,0, FALSE} // 63
, {doProperty, 80 /* P */, 18,0, FALSE} // 64
, {doBackslashW, 87 /* W */, 3,0, TRUE} // 65
, {doBackslashw, 119 /* w */, 3,0, TRUE} // 66
, {doBackslashX, 88 /* X */, 3,0, TRUE} // 67
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 68
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 69
, {doStartString, 255, 11,0, TRUE} // 70
, {doExit, 255, 71,0, TRUE} // 71 errorDeath
, {doBackslashd, 100 /* d */, 18,0, TRUE} // 62
, {doBackslashD, 68 /* D */, 18,0, TRUE} // 63
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 64
, {doProperty, 112 /* p */, 18,0, FALSE} // 65
, {doProperty, 80 /* P */, 18,0, FALSE} // 66
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 67
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 68
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 69
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 70
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 71
, {doStartString, 255, 11,0, TRUE} // 72
, {doExit, 255, 73,0, TRUE} // 73 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -229,6 +233,8 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -214,12 +214,14 @@ backslash:
'A' n term doBackslashA
'B' n term doBackslashB
'b' n term doBackslashb
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'W' n term doBackslashW
'w' n term doBackslashw
'X' n term doBackslashX
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz

View file

@ -26,7 +26,7 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po
static const uint32_t URX_NOP = 7;
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
static const uint32_t URX_UNUSED10 = 10;
static const uint32_t URX_STATIC_SETREF = 10; // Value field is index of set in array of sets.
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
static const uint32_t URX_DOTANY = 12;
static const uint32_t URX_JMP = 13; // Value field is destination position in
@ -35,11 +35,14 @@ static const uint32_t URX_FAIL = 14; // Stop match operation; No
static const uint32_t URX_BACKSLASH_A = 15;
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
static const uint32_t URX_BACKSLASH_G = 17;
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
static const uint32_t URX_BACKSLASH_X = 19;
static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z
static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any mode.
//
// Convenience macros for assembling and disassembling a compiled operation.
@ -49,5 +52,16 @@ static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1
#define URX_VAL(x) ((x) & 0xffffff)
//
// Access to Unicode Sets for composite properties
// The sets are accessed by the match engine for things like \w (word
// characters) and \b (word boundaries). The *_SET values are indexes into
// the array of static sets (RegexPattern::fStaticSets).
//
static const uint32_t URX_ISWORD_SET = 1;   // Word characters, as matched by \w
static const uint32_t URX_ISALNUM_SET = 2;  // presumably alphanumerics - TODO confirm
static const uint32_t URX_ISALPHA_SET = 3;  // presumably alphabetics - TODO confirm
static const uint32_t URX_LAST_SET = 4;     // One past the last set index; array size.
static const uint32_t URX_NEG_SET = 0x800000; // Flag bit to reverse sense of set
// membership test. It is the top bit of the 24 bit
// operand value field (see URX_VAL), and is or-ed with
// a set index in a URX_STATIC_SETREF op.
#endif

View file

@ -183,15 +183,15 @@ int32_t RegexMatcher::end(UErrorCode &err) const {
int32_t RegexMatcher::end(int group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return 0;
return -1;
}
if (fMatch == FALSE) {
err = U_REGEX_INVALID_STATE;
return 0;
return -1;
}
if (group < 0 || group > fPattern->fNumCaptureGroups) {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
return -1;
}
int32_t e = -1;
if (group == 0) {
@ -404,15 +404,15 @@ int32_t RegexMatcher::start(UErrorCode &err) const {
int32_t RegexMatcher::start(int group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return 0;
return -1;
}
if (fMatch == FALSE) {
err = U_REGEX_INVALID_STATE;
return 0;
return -1;
}
if (group < 0 || group > fPattern->fNumCaptureGroups) {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
return -1;
}
int32_t s;
if (group == 0) {
@ -426,6 +426,54 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
//--------------------------------------------------------------------------------
//
//   isWordBoundary    Perform the \b test at input position pos.
//
//        in perl, "xab..cd..", \b is true at positions 0,3,5,7
//        For us,
//          If the current char is a combining mark, \b is FALSE
//          Scan backwards to the first non-combining char
//          Pos is a boundary if the current and previous chars are
//             opposite in membership in \w set
//          A position at the end of the input has no current char; it is
//             treated as a non-word char, so the end of the input is a
//             boundary exactly when the last non-combining char is a word
//             char (e.g. position 3 of "abc").  Likewise, position 0 has no
//             previous char, which is treated as a non-word char.
//
//   @param pos   Index into the input string of the position to test.
//   @return      TRUE if pos is a word boundary, FALSE otherwise.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int32_t pos) {
    UBool cIsWord = FALSE;

    if (pos < fInputLength) {
        // Determine whether char c at Pos is a member of the word set of chars.
        UChar32 c = fInput->char32At(pos);
        int8_t ctype = u_charType(c);
        if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
            // Current char is a combining one. Not a boundary.
            return FALSE;
        }
        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
    }
    // else: off the end of the string. There is no current char; cIsWord stays
    // FALSE, and the backward scan below decides whether this is a boundary.

    // Back up until we come to a non-combining char, determine whether
    // that char is a word char. Start no further right than the end of input.
    UBool prevCIsWord = FALSE;
    int32_t prevPos = (pos < fInputLength) ? pos : fInputLength;
    while (prevPos > 0) {
        prevPos = fInput->moveIndex32(prevPos, -1);
        UChar32 prevChar = fInput->char32At(prevPos);
        int8_t prevCType = u_charType(prevChar);
        if (!(prevCType==U_NON_SPACING_MARK || prevCType==U_ENCLOSING_MARK)) {
            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
            break;
        }
    }

    // A boundary is a change in word-char membership across the position.
    UBool isBoundary = cIsWord ^ prevCIsWord;
    return isBoundary;
}
//--------------------------------------------------------------------------------
//
// getCaptureText We have encountered a '\' that might precede a
@ -597,23 +645,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
case URX_BACKSLASH_B: // Test for word boundaries
if (FALSE) {
backTrack(inputIdx, patIdx);
{
UBool success = isWordBoundary(inputIdx);
success ^= (opValue != 0); // flip sense for \B
if (!success) {
backTrack(inputIdx, patIdx);
}
}
break;
case URX_BACKSLASH_D:
{
if (inputIdx >= fInputLength) {
backTrack(inputIdx, patIdx);
break;
}
UChar32 c = fInput->char32At(inputIdx);
int8_t ctype = u_charType(c);
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
success ^= (opValue != 0);
if (success) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
} else {
backTrack(inputIdx, patIdx);
}
}
break;
case URX_BACKSLASH_G: // Test for position at end of previous match
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_W: // Match word chars (TODO: doesn't belong here?
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_X: // Match combining character sequence
if (FALSE) {
@ -629,6 +698,33 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_STATIC_SETREF:
{
// Test input character against one of the predefined sets
// (Word Characters, for example)
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
opValue &= ~URX_NEG_SET;
if (inputIdx < fInputLength) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c = fInput->char32At(inputIdx);
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
}
}
if (success) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
} else {
backTrack(inputIdx, patIdx);
}
}
break;
case URX_SETREF:
if (inputIdx < fInputLength) {
// There is input left. Pick up one char and test it for set membership.
@ -648,12 +744,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_DOTANY:
// . matches anything, but does not match if we've run out of input.
if (inputIdx < fInputLength) {
// There is input left. Advance one character in it.
{
// . matches anything
if (inputIdx >= fInputLength) {
// At end of input. Match failed. Backtrack out.
backTrack(inputIdx, patIdx);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c = fInput->char32At(inputIdx);
inputIdx = fInput->moveIndex32(inputIdx, 1);
} else {
backTrack(inputIdx, patIdx);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
// End of line in normal mode. . does not match.
backTrack(inputIdx, patIdx);
break;
}
}
break;
case URX_DOTANY_ALL:
{
// ., in dot-matches-all (including new lines) mode
// . matches anything
if (inputIdx >= fInputLength) {
// At end of input. Match failed. Backtrack out.
backTrack(inputIdx, patIdx);
break;
}
// There is input left. Advance over one char; a CR/LF pair is consumed as a unit.
UChar32 c = fInput->char32At(inputIdx);
inputIdx = fInput->moveIndex32(inputIdx, 1);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
// In the case of a CR/LF, we need to advance over both.
UChar32 nextc = fInput->char32At(inputIdx);
if (c == 0x0d && nextc == 0x0a) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
}
}
}
break;

View file

@ -411,7 +411,7 @@ static char *opNames[] = {
"NOP",
"START_CAPTURE",
"END_CAPTURE",
"UNUSED10",
"URX_STATIC_SETREF",
"SETREF",
"DOTANY",
"JMP",
@ -466,6 +466,7 @@ void RegexPattern::dump() {
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_SETREF:
case URX_STATIC_SETREF:
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:

View file

@ -19,6 +19,7 @@ U_NAMESPACE_BEGIN
class RegexMatcher;
class UVector;
class UStack;
class UnicodeSet;
//---------------------------------------------------------------------------------
@ -26,7 +27,7 @@ class UStack;
// Flags for Regular Expression Modes.
// TODO: Move to C header once one exists.
// All flags default to off or false
// All are as defined by Java Regexes.
// All are as defined by Java Regexps.
//
//---------------------------------------------------------------------------------
enum {
@ -34,7 +35,7 @@ enum {
UREGEX_CASE_INSENSITIVE = 2, // Enable case insensitive matching.
UREGEX_COMMENTS = 4, // Allow white space and comments within patterns
UREGEX_DOTALL = 32, // If set, "." matches line terminators.
// otherwise matching stops at line end.
// otherwise . matching stops at line end.
UREGEX_MULTILINE = 8, // Control behavior of "$" and "^".
// If set, recognize line terminators within string
// otherwise, match only at start and end of
@ -165,7 +166,7 @@ private:
//
UnicodeString fPattern; // The original pattern string.
int32_t fFlags; // The flags used when compiling the pattern.
// TODO: make an enum type for the flags.
//
UVector *fCompiledPat; // The compiled, tokenized pattern.
UnicodeString fLiteralText; // Any literal string data from the pattern,
// after un-escaping, for use during the match.
@ -180,6 +181,9 @@ private:
int32_t fNumCaptureGroups;
int32_t fMaxCaptureDigits;
const UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
friend class RegexCompile;
friend class RegexMatcher;
@ -428,6 +432,7 @@ private:
int32_t &repIdx,
int32_t &textStart,
int32_t &textEnd);
UBool isWordBoundary(int32_t pos); // perform the \b test
const RegexPattern *fPattern;

View file

@ -263,6 +263,7 @@ void RegexTest::regex_find(char *pat, char *input, UErrorCode expectedStatus, in
// matcher->groupCount does not include group 0, the entire match, hence the +1.
if (isMatch == FALSE && groupStarts.size() != 0) {
errln("Error at line %d: Match expected, but none found.\n", line);
failed = true;
goto cleanupAndReturn;
}
int i;
@ -315,7 +316,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.")
REGEX_FIND("\\D+", "<0>non digits</0>");
}
exit(1);
#endif
@ -428,7 +429,6 @@ void RegexTest::Basic() {
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE); // BS
// REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape
// REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
@ -1012,6 +1012,34 @@ void RegexTest::Extended() {
REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.");
REGEX_FIND( "(\\p{L}+)(\\P{L}*?) (\\p{Zs}*)", "7999<0><1>letters</1><2>4949%^&*(</2> <3> </3></0>");
// \w and \W
REGEX_FIND( "\\w+", " $%^&*( <0>hello123</0>%^&*(");
REGEX_FIND( "\\W+", "<0> $%^&*( </0>hello123%^&*(");
// \b \B
REGEX_FIND( ".*?\\b(.).*", "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>");
// Finds first chars of up to 5 words
REGEX_FIND( "(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?",
"<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox");
REGEX_FIND( "H.*?((?:\\B.)+)", "<0>H<1>ello</1></0> ");
REGEX_FIND( ".*?((?:\\B.)+).*?((?:\\B.)+).*?((?:\\B.)+)",
"<0>H<1>ello</1> <2> </2>g<3>oodbye</3></0> ");
REGEX_FIND("(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?.*",
"<0> \\u0301 \\u0301<1>A</1>\\u0302BC\\u0303\\u0304<2> </2>\\u0305 \\u0306"
"<3>X</3>\\u0307Y\\u0308</0>");
// . does not match new-lines
REGEX_FIND(".", "\\u000a\\u000d\\u0085\\u000c\\u2028\\u2029<0>X</0>\\u000aY");
REGEX_FIND("A.", "A\\u000a "); // no match
// \d for decimal digits
REGEX_FIND("\\d*", "<0>0123456789\\u0660\\u06F9\\u0969\\u0A66\\u1369"
"\\u17E2\\uFF10\\U0001D7CE\\U0001D7FF</0>non-digits");
REGEX_FIND("\\D+", "<0>non digits</0>");
REGEX_FIND("\\D*(\\d*)(\\D*)", "<0>non-digits<1>3456666</1><2>more non digits</2></0>");
}