diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 7274ca9a925..5f6db594073 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -148,9 +148,6 @@ void RegexCompile::compile( if (U_FAILURE(*fStatus)) { return; } - fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; - fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; - // Initialize the pattern scanning state machine fPatternLength = utext_nativeLength(pat); @@ -1565,15 +1562,15 @@ UBool RegexCompile::doParseActions(int32_t action) case doSetBackslash_s: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); - set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); + set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); break; } case doSetBackslash_S: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); - UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); - SSet.complement(); + UnicodeSet SSet; + SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement(); set->addAll(SSet); break; } @@ -1642,15 +1639,15 @@ UBool RegexCompile::doParseActions(int32_t action) case doSetBackslash_w: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); - set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); + set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); break; } case doSetBackslash_W: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); - UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); - SSet.complement(); + UnicodeSet SSet; + SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement(); set->addAll(SSet); break; } @@ -2425,6 +2422,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) { // The set contains two or more chars. (the normal case) // Put it into the compiled pattern as a set. + theSet->freeze(); int32_t setNumber = fRXPat->fSets->size(); fRXPat->fSets->addElement(theSet, *fStatus); appendOp(URX_SETREF, setNumber); @@ -2818,8 +2816,8 @@ void RegexCompile::matchStartType() { if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn>0 && snfStaticSets[sn]; - fRXPat->fInitialChars->addAll(*s); + const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn]; + fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); @@ -2831,9 +2829,8 @@ void RegexCompile::matchStartType() { case URX_STAT_SETREF_N: if (currentLen == 0) { int32_t sn = URX_VAL(op); - const UnicodeSet *s = fRXPat->fStaticSets[sn]; - UnicodeSet sc(*s); - sc.complement(); + UnicodeSet sc; + sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement(); fRXPat->fInitialChars->addAll(sc); numInitialStrings += 2; } @@ -4420,7 +4417,8 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB status = U_ZERO_ERROR; if (propName.caseCompare(u"word", -1, 0) == 0) { - set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status); + set.adoptInsteadAndCheckErrorCode( + RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status); break; } if (propName.compare(u"all", -1) == 0) { diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index 8b12096ac7c..d44c2aec2be 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -20,117 +20,117 @@ U_NAMESPACE_BEGIN // // Character classes for regex pattern scanning. // - static const uint8_t kRuleSet_ascii_letter = 128; - static const uint8_t kRuleSet_digit_char = 129; + static const uint8_t kRuleSet_digit_char = 128; + static const uint8_t kRuleSet_ascii_letter = 129; static const uint8_t kRuleSet_rule_char = 130; - + constexpr uint32_t kRuleSet_count = 131-128; enum Regex_PatternParseAction { - doSetBackslash_V, - doSetBackslash_h, - doBeginNamedBackRef, - doSetMatchMode, - doEnterQuoteMode, - doOpenCaptureParen, - doContinueNamedCapture, - doSetBackslash_d, - doBeginMatchMode, - doBackslashX, - doSetPosixProp, - doIntervalError, - doSetLiteralEscaped, - doSetBackslash_s, - doNOP, - doBackslashv, - doOpenLookBehind, - doPatStart, - doPossessiveInterval, - doOpenAtomicParen, - doOpenLookAheadNeg, - doBackslashd, - doBackslashZ, - doIntervalUpperDigit, - doBadNamedCapture, - doSetDifference2, - doSetAddAmp, - doSetNamedChar, - doNamedChar, - doSetBackslash_H, - doBackslashb, - doBackslashz, - doSetBeginDifference1, - doOpenLookAhead, - doMatchModeParen, - doBackslashV, - doIntevalLowerDigit, - doCaret, - doSetEnd, - doSetNegate, - doBackslashS, - doOrOperator, - doBackslashB, - doBackslashw, - doBackslashR, - doRuleError, - doDotAny, - doMatchMode, - doSetBackslash_W, - doNGPlus, doSetBackslash_D, - doPossessiveOpt, - doSetNamedRange, - doConditionalExpr, - doBackslashs, - doPossessiveStar, - doPlus, - doBadOpenParenType, - doCloseParen, - doNGInterval, - doSetProp, - doBackRef, - doSetBeginUnion, - doEscapeError, - doOpt, - doSetBeginIntersection1, - doPossessivePlus, - doBackslashD, - doOpenLookBehindNeg, - doSetBegin, - doSetIntersection2, - doCompleteNamedBackRef, - doSetRange, - doDollar, - doBackslashH, - doExit, - doNGOpt, - doOpenNonCaptureParen, - doBackslashA, - doSetBackslash_v, doBackslashh, - doBadModeFlag, - doSetNoCloseError, - doIntervalSame, - doSetAddDash, - doBackslashW, - doPerlInline, - doSetOpError, + doBackslashH, + doSetLiteralEscaped, + doOpenLookAheadNeg, + doCompleteNamedBackRef, + doPatStart, + doBackslashS, + doBackslashD, + doNGStar, + doNOP, + doBackslashX, doSetLiteral, - doPatFinish, - doBeginNamedCapture, + doContinueNamedCapture, + doBackslashG, + doBackslashR, + doSetBegin, + doSetBackslash_v, + doPossessivePlus, + doPerlInline, + doBackslashZ, + doSetAddAmp, + doSetBeginDifference1, + doIntervalError, + doSetNegate, + doIntervalInit, + doSetIntersection2, + doPossessiveInterval, + doRuleError, + doBackslashW, + doContinueNamedBackRef, + doOpenNonCaptureParen, + doExit, + doSetNamedChar, + doSetBackslash_V, + doConditionalExpr, + doEscapeError, + doBadOpenParenType, + doPossessiveStar, + doSetAddDash, doEscapedLiteralChar, + doSetBackslash_w, + doIntervalUpperDigit, + doBackslashv, + doSetBackslash_S, + doSetNoCloseError, + doSetProp, + doBackslashB, + doSetEnd, + doSetRange, + doMatchModeParen, + doPlus, + doBackslashV, + doSetMatchMode, + doBackslashz, + doSetNamedRange, + doOpenLookBehindNeg, + doInterval, + doBadNamedCapture, + doBeginMatchMode, + doBackslashd, + doPatFinish, + doNamedChar, + doNGPlus, + doSetDifference2, + doSetBackslash_H, + doCloseParen, + doDotAny, + doOpenCaptureParen, + doEnterQuoteMode, + doOpenAtomicParen, + doBadModeFlag, + doSetBackslash_d, + doSetFinish, + doProperty, + doBeginNamedBackRef, + doBackRef, + doOpt, + doDollar, + doBeginNamedCapture, + doNGInterval, + doSetOpError, + doSetPosixProp, + doSetBeginIntersection1, + doBackslashb, + doSetBeginUnion, + doIntevalLowerDigit, + doSetBackslash_h, + doStar, + doMatchMode, + doBackslashA, + doOpenLookBehind, + doPossessiveOpt, + doOrOperator, + doBackslashw, + doBackslashs, doLiteralChar, doSuppressComments, + doCaret, + doIntervalSame, + doNGOpt, + doOpenLookAhead, + doSetBackslash_W, doMismatchedParenErr, - doNGStar, - doSetFinish, - doInterval, - doBackslashG, - doStar, - doSetBackslash_w, - doSetBackslash_S, - doProperty, - doContinueNamedBackRef, - doIntervalInit, + doSetBackslash_s, rbbiLastAction}; //------------------------------------------------------------------------------- @@ -197,7 +197,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBadOpenParenType, 255, 206,0, FALSE} // 45 , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47 - , {doBeginNamedCapture, 128, 64,0, FALSE} // 48 + , {doBeginNamedCapture, 129, 64,0, FALSE} // 48 , {doBadOpenParenType, 255, 206,0, FALSE} // 49 , {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment , {doMismatchedParenErr, 253, 206,0, FALSE} // 51 @@ -213,8 +213,8 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61 , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62 , {doBadModeFlag, 255, 206,0, FALSE} // 63 - , {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture - , {doContinueNamedCapture, 129, 64,0, TRUE} // 65 + , {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture + , {doContinueNamedCapture, 128, 64,0, TRUE} // 65 , {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66 , {doBadNamedCapture, 255, 206,0, FALSE} // 67 , {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star @@ -226,13 +226,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75 , {doOpt, 255, 20,0, FALSE} // 76 - , {doNOP, 129, 79,0, FALSE} // 77 interval-open + , {doNOP, 128, 79,0, FALSE} // 77 interval-open , {doIntervalError, 255, 206,0, FALSE} // 78 - , {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower + , {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower , {doNOP, 44 /* , */, 83,0, TRUE} // 80 , {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81 , {doIntervalError, 255, 206,0, FALSE} // 82 - , {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper + , {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper , {doNOP, 125 /* } */, 86,0, TRUE} // 84 , {doIntervalError, 255, 206,0, FALSE} // 85 , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type @@ -261,15 +261,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBackslashX, 88 /* X */, 14,0, TRUE} // 109 , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110 , {doBackslashz, 122 /* z */, 2,0, TRUE} // 111 - , {doBackRef, 129, 14,0, TRUE} // 112 + , {doBackRef, 128, 14,0, TRUE} // 112 , {doEscapeError, 253, 206,0, FALSE} // 113 , {doEscapedLiteralChar, 255, 14,0, TRUE} // 114 , {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref , {doBadNamedCapture, 255, 206,0, FALSE} // 116 - , {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2 + , {doContinueNamedBackRef, 129, 119,0, TRUE} // 117 named-backref-2 , {doBadNamedCapture, 255, 206,0, FALSE} // 118 - , {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3 - , {doContinueNamedBackRef, 129, 119,0, TRUE} // 120 + , {doContinueNamedBackRef, 129, 119,0, TRUE} // 119 named-backref-3 + , {doContinueNamedBackRef, 128, 119,0, TRUE} // 120 , {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121 , {doBadNamedCapture, 255, 206,0, FALSE} // 122 , {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open diff --git a/icu4c/source/i18n/regexcst.pl b/icu4c/source/i18n/regexcst.pl index 384281ffbad..3d656ed2310 100755 --- a/icu4c/source/i18n/regexcst.pl +++ b/icu4c/source/i18n/regexcst.pl @@ -10,13 +10,13 @@ # regexcst.pl # Compile the regular expression paser state table data into initialized C data. # Usage: -# cd icu/source/i18n +# cd icu4c/source/i18n # perl regexcst.pl < regexcst.txt > regexcst.h # # The output file, regexcst.h, is included by some of the .cpp regex # implementation files. This perl script is NOT run as part # of a normal ICU build. It is run by hand when needed, and the -# regexcst.h generated file is put back into cvs. +# regexcst.h generated file is put back into the source code repository. # # See regexcst.txt for a description of the input format for this script. # @@ -201,6 +201,8 @@ for ($state=1; $state<$num_states; $state++) { die if ($errors>0); +print "// © 2016 and later: Unicode, Inc. and others.\n"; +print "// License & terms of use: http://www.unicode.org/copyright.html\n"; print "//---------------------------------------------------------------------------------\n"; print "//\n"; print "// Generated Header File. Do not edit by hand.\n"; @@ -246,6 +248,7 @@ foreach $setName (keys %charClasses) { $i++; } } +print " constexpr uint32_t kRuleSet_count = $i-128;"; print "\n\n"; # diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp index 4f12e87bc66..97e417ab5a8 100644 --- a/icu4c/source/i18n/regexst.cpp +++ b/icu4c/source/i18n/regexst.cpp @@ -35,216 +35,101 @@ // generated by a Perl script. #include "regexst.h" - - U_NAMESPACE_BEGIN - -//------------------------------------------------------------------------------ -// -// Unicode Set pattern strings for all of the required constant sets. -// Initialized with hex values for portability to EBCDIC based machines. -// Really ugly, but there's no good way to avoid it. -// -//------------------------------------------------------------------------------ - -// "Rule Char" Characters are those with no special meaning, and therefore do not -// need to be escaped to appear as literals in a regexp. Expressed -// as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.] -static const UChar gRuleSet_rule_char_pattern[] = { - // [ ^ \ * \ ? \ + \ [ \ ( / ) - 0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29, - // \ { \ } \ ^ \ $ \ | \ \ \ . ] - 0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0}; +// "Rule Char" Characters are those with special meaning, and therefore +// need to be escaped to appear as literals in a regexp. +constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; // -// Here are the backslash escape characters that ICU's unescape() function -// will handle. +// The backslash escape characters that ICU's unescape() function will handle. // -static const UChar gUnescapeCharPattern[] = { -// [ a c e f n r t u U x ] - 0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0}; - +constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; // -// Unicode Set Definitions for Regular Expression \w +// Unicode Set pattern for Regular Expression \w // -static const UChar gIsWordPattern[] = { -// [ \ p { A l p h a b e t i c } - 0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d, -// \ p { M } Mark - 0x5c, 0x70, 0x7b, 0x4d, 0x7d, -// \ p { N d } Digit_Numeric - 0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, -// \ p { P c } Connector_Punctuation - 0x5c, 0x70, 0x7b, 0x50, 0x63, 0x7d, -// \ u 2 0 0 c \ u 2 0 0 d ] - 0x5c, 0x75, 0x32, 0x30, 0x30, 0x63, 0x5c, 0x75, 0x32, 0x30, 0x30, 0x64, 0x5d, 0}; - +constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; // // Unicode Set Definitions for Regular Expression \s // -static const UChar gIsSpacePattern[] = { -// [ \ p { W h i t e S p a c e } ] - 0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0}; - +constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; // // UnicodeSets used in implementation of Grapheme Cluster detection, \X // -static const UChar gGC_ControlPattern[] = { -// [ [ : Z l : ] [ : Z p : ] - 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, -// [ : C c : ] [ : C f : ] - - 0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d, -// [ : G r a p h e m e _ - 0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f, -// E x t e n d : ] ] - 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0}; - -static const UChar gGC_ExtendPattern[] = { -// [ \ p { G r a p h e m e _ - 0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f, -// E x t e n d } ] - 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0}; - -static const UChar gGC_LPattern[] = { -// [ \ p { H a n g u l _ S y l - 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, -// l a b l e _ T y p e = L } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0}; - -static const UChar gGC_VPattern[] = { -// [ \ p { H a n g u l _ S y l - 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, -// l a b l e _ T y p e = V } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0}; - -static const UChar gGC_TPattern[] = { -// [ \ p { H a n g u l _ S y l - 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, -// l a b l e _ T y p e = T } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; - -static const UChar gGC_LVPattern[] = { -// [ \ p { H a n g u l _ S y l - 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, -// l a b l e _ T y p e = L V } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; - -static const UChar gGC_LVTPattern[] = { -// [ \ p { H a n g u l _ S y l - 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, -// l a b l e _ T y p e = L V T } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0}; +constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; +constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; +constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; +constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; +constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; +constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; +constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; -RegexStaticSets *RegexStaticSets::gStaticSets = NULL; +RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER; -RegexStaticSets::RegexStaticSets(UErrorCode *status) -: -fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status), -fRuleDigitsAlias(NULL), -fEmptyText(NULL) -{ - // First zero out everything - int i; - for (i=0; iremove(0xac00, 0xd7a4); - fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]); - fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]); - fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]); - fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]); + fPropSets[URX_GC_NORMAL].complement(); + fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); + fPropSets[URX_GC_NORMAL].freeze(); // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. - for (i=0; icompact(); - fPropSets8[i].init(fPropSets[i]); - } + // + // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? + // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" + // This runs in exponential time, making it easy to adjust the time for + // convenient measuring. + // + // This 8 bit optimization dates from the early days of ICU, + // with a less optimized UnicodeSet. At the time, the difference + // was substantial. + + for (int32_t i=0; ifStaticSets[URX_ISWORD_SET]->contains(c); + cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); } // Back up until we come to a non-combining char, determine whether @@ -2555,7 +2555,7 @@ UBool RegexMatcher::isWordBoundary(int64_t pos) { UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) || u_charType(prevChar) == U_FORMAT_CHAR)) { - prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); break; } } @@ -2580,7 +2580,7 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { // Current char is a combining one. Not a boundary. return FALSE; } - cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); } // Back up until we come to a non-combining char, determine whether @@ -2594,7 +2594,7 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { U16_PREV(inputBuf, fLookStart, pos, prevChar); if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) || u_charType(prevChar) == U_FORMAT_CHAR)) { - prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); break; } } @@ -3203,14 +3203,14 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { UChar32 c; c = UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); - UnicodeSet **sets = fPattern->fStaticSets; - if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; - if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; + UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets; + if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control; + if (sets[URX_GC_L].contains(c)) goto GC_L; + if (sets[URX_GC_LV].contains(c)) goto GC_V; + if (sets[URX_GC_LVT].contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; + if (sets[URX_GC_T].contains(c)) goto GC_T; goto GC_Extend; @@ -3219,10 +3219,10 @@ GC_L: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; c = UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_L].contains(c)) goto GC_L; + if (sets[URX_GC_LV].contains(c)) goto GC_V; + if (sets[URX_GC_LVT].contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; (void)UTEXT_PREVIOUS32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; @@ -3231,8 +3231,8 @@ GC_V: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; c = UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; + if (sets[URX_GC_T].contains(c)) goto GC_T; (void)UTEXT_PREVIOUS32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; @@ -3241,7 +3241,7 @@ GC_T: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; c = UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); - if (sets[URX_GC_T]->contains(c)) goto GC_T; + if (sets[URX_GC_T].contains(c)) goto GC_T; (void)UTEXT_PREVIOUS32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; @@ -3253,7 +3253,7 @@ GC_Extend: break; } c = UTEXT_CURRENT32(fInputText); - if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + if (sets[URX_GC_EXTEND].contains(c) == FALSE) { break; } (void)UTEXT_NEXT32(fInputText); @@ -3310,13 +3310,13 @@ GC_Done: UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { - Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; - if (s8->contains(c)) { + Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; + if (s8.contains(c)) { success = !success; } } else { - const UnicodeSet *s = fPattern->fStaticSets[opValue]; - if (s->contains(c)) { + const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; + if (s.contains(c)) { success = !success; } } @@ -3346,14 +3346,14 @@ GC_Done: UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { - Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; - if (s8->contains(c) == FALSE) { + Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; + if (s8.contains(c) == FALSE) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { - const UnicodeSet *s = fPattern->fStaticSets[opValue]; - if (s->contains(c) == FALSE) { + const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; + if (s.contains(c) == FALSE) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } @@ -4778,14 +4778,14 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu // Dispatch into a little state machine, based on the char. UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - UnicodeSet **sets = fPattern->fStaticSets; - if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; - if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; + UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets; + if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control; + if (sets[URX_GC_L].contains(c)) goto GC_L; + if (sets[URX_GC_LV].contains(c)) goto GC_V; + if (sets[URX_GC_LVT].contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; + if (sets[URX_GC_T].contains(c)) goto GC_T; goto GC_Extend; @@ -4793,25 +4793,25 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu GC_L: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_L]->contains(c)) goto GC_L; - if (sets[URX_GC_LV]->contains(c)) goto GC_V; - if (sets[URX_GC_LVT]->contains(c)) goto GC_T; - if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_L].contains(c)) goto GC_L; + if (sets[URX_GC_LV].contains(c)) goto GC_V; + if (sets[URX_GC_LVT].contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; U16_PREV(inputBuf, 0, fp->fInputIdx, c); goto GC_Extend; GC_V: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_V]->contains(c)) goto GC_V; - if (sets[URX_GC_T]->contains(c)) goto GC_T; + if (sets[URX_GC_V].contains(c)) goto GC_V; + if (sets[URX_GC_T].contains(c)) goto GC_T; U16_PREV(inputBuf, 0, fp->fInputIdx, c); goto GC_Extend; GC_T: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_T]->contains(c)) goto GC_T; + if (sets[URX_GC_T].contains(c)) goto GC_T; U16_PREV(inputBuf, 0, fp->fInputIdx, c); goto GC_Extend; @@ -4822,7 +4822,7 @@ GC_Extend: break; } U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + if (sets[URX_GC_EXTEND].contains(c) == FALSE) { U16_BACK_1(inputBuf, 0, fp->fInputIdx); break; } @@ -4877,13 +4877,13 @@ GC_Done: UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { - Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; - if (s8->contains(c)) { + Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; + if (s8.contains(c)) { success = !success; } } else { - const UnicodeSet *s = fPattern->fStaticSets[opValue]; - if (s->contains(c)) { + const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; + if (s.contains(c)) { success = !success; } } @@ -4909,13 +4909,13 @@ GC_Done: UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { - Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; - if (s8->contains(c) == FALSE) { + Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; + if (s8.contains(c) == FALSE) { break; } } else { - const UnicodeSet *s = fPattern->fStaticSets[opValue]; - if (s->contains(c) == FALSE) { + const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; + if (s.contains(c) == FALSE) { break; } } diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index bf18695995b..b3028e04f7a 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -97,8 +97,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fMinMatchLen = other.fMinMatchLen; fFrameSize = other.fFrameSize; fDataSize = other.fDataSize; - fStaticSets = other.fStaticSets; - fStaticSets8 = other.fStaticSets8; fStartType = other.fStartType; fInitialStringIdx = other.fInitialStringIdx; @@ -175,8 +173,6 @@ void RegexPattern::init() { fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; - fStaticSets = NULL; - fStaticSets8 = NULL; fStartType = START_NO_INFO; fInitialStringIdx = 0; fInitialStringLen = 0; @@ -805,8 +801,8 @@ void RegexPattern::dumpOp(int32_t index) const { printf("NOT "); val &= ~URX_NEG_SET; } - UnicodeSet *set = fStaticSets[val]; - set->toPattern(s, TRUE); + UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val]; + set.toPattern(s, TRUE); printf("%s", CStr(s)()); } break; diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 6338eb7c754..9be2771bb4a 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -612,12 +612,6 @@ private: UVector32 *fGroupMap; // Map from capture group number to position of // the group's variables in the matcher stack frame. - UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined - // regex character classes, e.g. Word. - - Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only - // sets for predefined regex classes. - int32_t fStartType; // Info on how a match must start. int32_t fInitialStringIdx; // int32_t fInitialStringLen; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 681ae1150e2..04867db9e51 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -3500,11 +3500,15 @@ void RegexTest::regex_find(const UnicodeString &pattern, // positions. // parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); - REGEX_CHECK_STATUS_L(line); + if (!assertSuccess(WHERE, status) ) { + goto cleanupAndReturn; + } unEscapedInput = inputString.unescape(); parseMatcher = parsePat->matcher(unEscapedInput, status); - REGEX_CHECK_STATUS_L(line); + if (!assertSuccess(WHERE, status) ) { + goto cleanupAndReturn; + } while(parseMatcher->find()) { parseMatcher->appendReplacement(deTaggedInput, "", status); REGEX_CHECK_STATUS; @@ -4203,6 +4207,8 @@ void RegexTest::PerlTests() { if (expected != found) { errln("line %d: Expected %smatch, got %smatch", lineNum, expected?"":"no ", found?"":"no " ); + delete testMat; + delete testPat; continue; } @@ -4598,6 +4604,8 @@ void RegexTest::PerlTestsUTF8() { if (expected != found) { errln("line %d: Expected %smatch, got %smatch", lineNum, expected?"":"no ", found?"":"no " ); + delete testMat; + delete testPat; continue; }