ICU-1130 use perl \N{name} notation in Any-Name and Name-Any

X-SVN-Rev: 9849
This commit is contained in:
Alan Liu 2002-09-06 23:30:29 +00:00
parent c8f160bcab
commit 85d23479b6
5 changed files with 143 additions and 127 deletions

View file

@ -10,11 +10,11 @@
#include "unicode/unifilt.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "name2uni.h"
#include "cmemory.h"
#include "uprops.h"
// As of Unicode 3.0.0, the longest name is 83 characters long.
#define LONGEST_NAME 83
#include "util.h"
U_NAMESPACE_BEGIN
@ -22,25 +22,17 @@ const char NameUnicodeTransliterator::fgClassID=0;
const char NameUnicodeTransliterator::_ID[] = "Name-Any";
/**
* Constructs a transliterator.
*/
NameUnicodeTransliterator::NameUnicodeTransliterator(
UChar32 openDelim, UChar32 closeDelim,
UnicodeFilter* adoptedFilter) :
Transliterator(_ID, adoptedFilter),
openDelimiter(openDelim),
closeDelimiter(closeDelim) {
}
static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN
static const UChar CLOSE_DELIM = 125; // '}'
static const UChar SPACE = 32; // ' '
/**
* Constructs a transliterator that recognizes the Perl-style
* notation \N{name}; the delimiters are fixed and not configurable.
*/
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
Transliterator(_ID, adoptedFilter),
openDelimiter((UChar) 0x007B /*{*/),
closeDelimiter((UChar) 0x007D /*}*/) {
Transliterator(_ID, adoptedFilter) {
}
/**
@ -52,9 +44,7 @@ NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
* Copy constructor.
*/
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
Transliterator(o),
openDelimiter(o.openDelimiter),
closeDelimiter(o.closeDelimiter) {}
Transliterator(o) {}
/**
* Assignment operator.
@ -62,8 +52,6 @@ NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliter
NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
const NameUnicodeTransliterator& o) {
Transliterator::operator=(o);
openDelimiter = o.openDelimiter;
closeDelimiter = o.closeDelimiter;
return *this;
}
@ -79,13 +67,30 @@ Transliterator* NameUnicodeTransliterator::clone(void) const {
*/
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
UBool isIncremental) const {
// Accommodate the longest possible name plus padding
UChar buf[LONGEST_NAME + 8];
char cbuf[LONGEST_NAME + 8]; // Default converter
// The failure mode, here and below, is to behave like Any-Null,
// if either there is no name data (max len == 0) or there is no
// memory (malloc() => NULL).
// The only characters used in names are (as of Unicode 3.0.0):
// -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
// (first character is a space).
int32_t maxLen = uprv_getMaxCharNameLength();
if (maxLen == 0) {
offsets.start = offsets.limit;
return;
}
// Accommodate the longest possible name
++maxLen; // allow for temporary trailing space
char* cbuf = (char*) uprv_malloc(maxLen);
if (cbuf == NULL) {
offsets.start = offsets.limit;
return;
}
UnicodeString openPat(TRUE, OPEN, -1);
UnicodeString str, name;
// Get the legal character set
UnicodeSet legal;
uprv_getCharNameCharacters((USet*) &legal); // USet* == UnicodeSet*
int32_t cursor = offsets.start;
int32_t limit = offsets.limit;
@ -94,63 +99,77 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos
// 0 - looking for open delimiter
// 1 - after open delimiter
int32_t mode = 0;
int32_t ibuf = 0;
int32_t openPos = offsets.start; // position of openDelimiter
UnicodeString str;
int32_t openPos = -1; // open delim candidate pos
UChar32 c;
for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {
while (cursor < limit) {
c = text.char32At(cursor);
switch (mode) {
case 0: // looking for open delimiter
if (c == openDelimiter) {
if (c == OPEN_DELIM) { // quick check first
openPos = cursor;
mode = 1;
ibuf = 0;
int32_t i =
ICU_Utility::parsePattern(openPat, text, cursor, limit);
if (i >= 0 && i < limit) {
mode = 1;
name.truncate(0);
cursor = i;
continue; // *** reprocess char32At(cursor)
}
}
break;
case 1: // after open delimiter
// Look for [-a-zA-Z0-9<>]. If \s+ is found, convert it
// Look for legal chars. If \s+ is found, convert it
// to a single space. If the close delimiter '}' is found, exit
// the loop. If any other character is found, exit the
// loop. If the limit is found, exit the loop.
// loop. If the limit is reached, exit the loop.
// Convert \s+ => SPACE. This assumes there are no
// runs of >1 space characters in names.
if (uprv_isRuleWhiteSpace(c)) {
// Ignore leading whitespace
if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
buf[ibuf++] = (UChar)0x0020 /* */;
// If we go a bit past the longest possible name then abort
if (ibuf == (LONGEST_NAME + 4)) {
if (name.length() > 0 &&
name.charAt(name.length()-1) != SPACE) {
name.append(SPACE);
// If we are too long then abort. maxLen includes
// temporary trailing space, so use '>'.
if (name.length() > maxLen) {
mode = 0;
}
}
continue;
break;
}
if (c == closeDelimiter) {
if (c == CLOSE_DELIM) {
int32_t len = name.length();
// Delete trailing space, if any
if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
--ibuf;
if (len > 0 &&
name.charAt(len-1) == SPACE) {
--len;
}
buf[ibuf] = 0; // Add terminating zero
name.extract(0, len, cbuf, "");
UErrorCode status = U_ZERO_ERROR;
UChar32 ch;
u_UCharsToChars(buf, cbuf, ibuf+1);
ch = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
if (U_SUCCESS(status)) {
// Lookup succeeded
// assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
cursor++; // advance over CLOSE_DELIM
str.truncate(0);
str.append(ch);
text.handleReplaceBetween(openPos, cursor+1, str);
str.append(c);
text.handleReplaceBetween(openPos, cursor, str);
// Adjust indices for the change in the length of
// the string. Do not assume that str.length() ==
// 1, in case of surrogates.
int32_t delta = cursor + 1 - openPos - str.length();
int32_t delta = cursor - openPos - str.length();
cursor -= delta;
limit -= delta;
// assert(cursor == openPos + str.length());
@ -158,18 +177,18 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos
// If the lookup failed, we leave things as-is and
// still switch to mode 0 and continue.
mode = 0;
continue;
openPos = -1; // close off candidate
continue; // *** reprocess char32At(cursor)
}
// Check if c =~ [-A-Za-z0-9<> ]
if (c == (UChar)0x002D ||
(c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
(c >= (UChar)0x0061 && c <= (UChar)0x007A) ||
(c >= (UChar)0x0030 && c <= (UChar)0x0039) ||
c == (UChar)0x003C || c == (UChar)0x003E) {
buf[ibuf++] = (char) c;
// If we go a bit past the longest possible name then abort
if (ibuf == (LONGEST_NAME + 4)) {
// Check if c is a legal char. We assume here that
// legal.contains(OPEN_DELIM) is FALSE, so when we abort a
// name, we don't have to go back to openPos+1.
if (legal.contains(c)) {
name.append(c);
// If we go past the longest possible name then abort.
// maxLen includes temporary trailing space, so use '>='.
if (name.length() >= maxLen) {
mode = 0;
}
}
@ -182,13 +201,17 @@ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPos
break;
}
cursor += UTF_CHAR_LENGTH(c);
}
offsets.contextLimit += limit - offsets.limit;
offsets.limit = limit;
// In incremental mode, only advance the cursor up to the last
// open delimiter, if we are in mode 1.
offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
// open delimiter candidate.
offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
uprv_free(cbuf);
}
U_NAMESPACE_END

View file

@ -16,14 +16,12 @@ U_NAMESPACE_BEGIN
/**
* A transliterator that performs name to character mapping.
* It recognizes the Perl syntax \N{name}.
* @author Alan Liu
* @draft ICU 2.0
*/
class U_I18N_API NameUnicodeTransliterator : public Transliterator {
UChar32 openDelimiter;
UChar32 closeDelimiter;
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
@ -34,17 +32,6 @@ class U_I18N_API NameUnicodeTransliterator : public Transliterator {
/**
* Constructs a transliterator.
* @param openDelimiter the open delimiter character.
* @param closeDelimiter the close delimiter character.
* @param adoptedFilter the filter for this transliterator.
* @draft ICU 2.0
*/
NameUnicodeTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
UnicodeFilter* adoptedFilter = 0);
/**
* Constructs a transliterator with the default delimiters '{' and
* '}'.
* @param adoptedFilter the filter for this transliterator.
* @draft ICU 2.0
*/

View file

@ -12,6 +12,8 @@
#include "unicode/uchar.h"
#include "uni2name.h"
#include "cstring.h"
#include "cmemory.h"
#include "uprops.h"
U_NAMESPACE_BEGIN
@ -19,25 +21,15 @@ const char UnicodeNameTransliterator::fgClassID=0;
const char UnicodeNameTransliterator::_ID[] = "Any-Name";
static const UChar OPEN_DELIM[] = {92,78,123,0}; // "\N{"
static const UChar CLOSE_DELIM = 125; // "}"
#define OPEN_DELIM_LEN 3
/**
* Constructs a transliterator.
*/
UnicodeNameTransliterator::UnicodeNameTransliterator(
UChar32 openDelim, UChar32 closeDelim,
UnicodeFilter* adoptedFilter) :
Transliterator(_ID, adoptedFilter),
openDelimiter(openDelim),
closeDelimiter(closeDelim) {
}
/**
* Constructs a transliterator that generates the Perl-style
* notation \N{name}; the delimiters are fixed and not configurable.
*/
UnicodeNameTransliterator::UnicodeNameTransliterator(UnicodeFilter* adoptedFilter) :
Transliterator(_ID, adoptedFilter),
openDelimiter((UChar) 0x007B /*{*/),
closeDelimiter((UChar) 0x007D /*}*/) {
Transliterator(_ID, adoptedFilter) {
}
/**
@ -49,9 +41,7 @@ UnicodeNameTransliterator::~UnicodeNameTransliterator() {}
* Copy constructor.
*/
UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliterator& o) :
Transliterator(o),
openDelimiter(o.openDelimiter),
closeDelimiter(o.closeDelimiter) {}
Transliterator(o) {}
/**
* Assignment operator.
@ -59,8 +49,6 @@ UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliter
UnicodeNameTransliterator& UnicodeNameTransliterator::operator=(
const UnicodeNameTransliterator& o) {
Transliterator::operator=(o);
openDelimiter = o.openDelimiter;
closeDelimiter = o.closeDelimiter;
return *this;
}
@ -78,15 +66,27 @@ Transliterator* UnicodeNameTransliterator::clone(void) const {
*/
void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
UBool /*isIncremental*/) const {
// As of Unicode 3.0.0, the longest name is 83 characters long.
// Adjust this buffer size as needed.
// The failure mode, here and below, is to behave like Any-Null,
// if either there is no name data (max len == 0) or there is no
// memory (malloc() => NULL).
char buf[128];
int32_t maxLen = uprv_getMaxCharNameLength();
if (maxLen == 0) {
offsets.start = offsets.limit;
return;
}
// Accommodate the longest possible name plus padding
char* buf = (char*) uprv_malloc(maxLen);
if (buf == NULL) {
offsets.start = offsets.limit;
return;
}
int32_t cursor = offsets.start;
int32_t limit = offsets.limit;
UnicodeString str(openDelimiter);
UnicodeString str(FALSE, OPEN_DELIM, OPEN_DELIM_LEN);
UErrorCode status;
int32_t len;
@ -94,11 +94,11 @@ void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPos
UChar32 c = text.char32At(cursor);
int32_t clen = UTF_CHAR_LENGTH(c);
status = U_ZERO_ERROR;
if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, sizeof(buf), &status)) >0 && !U_FAILURE(status)) {
str.truncate(1);
str.append(UnicodeString(buf, len, "")).append(closeDelimiter);
if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, maxLen, &status)) >0 && !U_FAILURE(status)) {
str.truncate(OPEN_DELIM_LEN);
str.append(UnicodeString(buf, len, "")).append(CLOSE_DELIM);
text.handleReplaceBetween(cursor, cursor+clen, str);
len += 2; // adjust for delimiters
len += OPEN_DELIM_LEN + 1; // adjust for delimiters
cursor += len; // advance cursor and adjust for new text
limit += len-clen; // change in length
} else {
@ -109,6 +109,8 @@ void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPos
offsets.contextLimit += limit - offsets.limit;
offsets.limit = limit;
offsets.start = cursor;
uprv_free(buf);
}
U_NAMESPACE_END

View file

@ -16,27 +16,15 @@ U_NAMESPACE_BEGIN
/**
* A transliterator that performs character to name mapping.
* It generates the Perl syntax \N{name}.
* @author Alan Liu
*/
class U_I18N_API UnicodeNameTransliterator : public Transliterator {
UChar32 openDelimiter;
UChar32 closeDelimiter;
public:
/**
* Constructs a transliterator.
* @param openDelimiter the open delimiter character.
* @param closeDelimiter the close delimiter character.
* @param adoptedFilter the filter to be adopted.
*/
UnicodeNameTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
UnicodeFilter* adoptedFilter = 0);
/**
* Constructs a transliterator with the default delimiters '{' and
* '}'.
* @param adoptedFilter the filter to be adopted.
*/
UnicodeNameTransliterator(UnicodeFilter* adoptedFilter = 0);

View file

@ -1180,13 +1180,28 @@ void TransliteratorTest::TestNameMap(void) {
return;
}
// Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
CharsToUnicodeString("{NO-BREAK SPACE}abc{CJK UNIFIED IDEOGRAPH-4E01}{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{END OF TRANSMISSION}{CHARACTER TABULATION}{<control-0081>}{<noncharacter-FFFF>}"));
expect(*name2uni, "{ NO-BREAK SPACE}abc{ CJK UNIFIED IDEOGRAPH-4E01 }{x{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{END OF TRANSMISSION}{CHARACTER TABULATION}{<control-0081>}{<noncharacter-FFFF>}{<control-0004>}{",
CharsToUnicodeString("\\u00A0abc\\u4E01{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004{"));
CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{END OF TRANSMISSION}\\\\N{CHARACTER TABULATION}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
expect(*name2uni, "{\\N { NO-BREAK SPACE}abc\\N{ CJK UNIFIED IDEOGRAPH-4E01 }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{END OF TRANSMISSION}\\N{CHARACTER TABULATION}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{",
CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
delete uni2name;
delete name2uni;
// round trip
Transliterator* t =
Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
if (t==0) {
errln("FAIL: createInstance returned NULL");
delete t;
return;
}
// Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
expect(*t, s, s);
delete t;
}
/**
@ -1678,7 +1693,7 @@ void TransliteratorTest::TestSupplemental() {
expectT("Any-Name",
CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
"{GOTHIC LETTER AHSA}{TAG LATIN SMALL LETTER A}{NO-BREAK SPACE}");
"\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}");
expectT("Any-Hex/Unicode",
CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
@ -3541,12 +3556,13 @@ void TransliteratorTest::TestUserFunction() {
_TUFReg("Any-gif", t, 0);
t = Transliterator::createFromRules("RemoveCurly",
"[\\{\\}] > ;",
"[\\{\\}] > ; '\\N' > ;",
UTRANS_FORWARD, pe, ec);
if (t == NULL || U_FAILURE(ec)) {
errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
goto FAIL;
}
expect(*t, "\\N{name}", "name");
_TUFReg("Any-RemoveCurly", t, 1);
logln("Trying &hex");
@ -3588,7 +3604,7 @@ void TransliteratorTest::TestUserFunction() {
// Test that filters are allowed after &
t = Transliterator::createFromRules("test",
"(.) > &Hex($1) ' ' &[\\{\\}]Remove(&Name($1)) ' ';",
"(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
UTRANS_FORWARD, pe, ec);
if (t == NULL || U_FAILURE(ec)) {
errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));