mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-09 15:27:38 +00:00
ICU-1225 add new Escape and Unescape transliterators to replace UnicodeToHex and HexToUnicode; register several Any-Hex and Hex-Any variants
X-SVN-Rev: 7049
This commit is contained in:
parent
8986e13b6a
commit
76b369219e
7 changed files with 697 additions and 5 deletions
|
@ -71,7 +71,7 @@ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
|
|||
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \
|
||||
remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \
|
||||
name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \
|
||||
llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o util.o
|
||||
llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o util.o esctrn.o unesctrn.o
|
||||
|
||||
|
||||
|
||||
|
|
167
icu4c/source/i18n/esctrn.cpp
Normal file
167
icu4c/source/i18n/esctrn.cpp
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/19/2001 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "esctrn.h"
|
||||
#include "util.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
static const UChar UNIPRE[] = {85,43,0}; // "U+"
|
||||
static const UChar BS_u[] = {92,117,0}; // "\\u"
|
||||
static const UChar BS_U[] = {92,85,0}; // "\\U"
|
||||
static const UChar XMLPRE[] = {38,35,120,0}; // "&#x"
|
||||
static const UChar XML10PRE[] = {38,35,0}; // "&#"
|
||||
static const UChar PERLPRE[] = {92,120,123,0}; // "\\x{"
|
||||
static const UChar SEMI[] = {59,0}; // ";"
|
||||
static const UChar RBRACE[] = {125,0}; // "}"
|
||||
static const UChar EMPTY[] = {0}; // ""
|
||||
|
||||
/**
|
||||
* Factory methods
|
||||
*/
|
||||
/**
 * Factory for the Unicode notation, e.g. "U+10FFFF": hexadecimal with at
 * least four digits and no suffix.  The context token is not consulted.
 */
Transliterator* EscapeTransliterator::_createUnicode(const UnicodeString& ID, Token context) {
    return new EscapeTransliterator(ID,
                                    UNIPRE,  // prefix "U+"
                                    EMPTY,   // no suffix
                                    16,      // radix
                                    4,       // minimum digit count
                                    TRUE,    // read whole code points
                                    NULL);   // no separate supplemental form
}
|
||||
/**
 * Factory for the Java notation, e.g. "\\uFFFF": hexadecimal with at least
 * four digits.  Input is read one 16-bit code unit at a time, so a
 * supplementary character is emitted as two escapes.  The context token
 * is not consulted.
 */
Transliterator* EscapeTransliterator::_createJava(const UnicodeString& ID, Token context) {
    return new EscapeTransliterator(ID,
                                    BS_u,   // prefix "\\u"
                                    EMPTY,  // no suffix
                                    16,     // radix
                                    4,      // minimum digit count
                                    FALSE,  // read 16-bit code units, not code points
                                    NULL);  // no separate supplemental form
}
|
||||
/**
 * Factory for the C notation: "\\uFFFF" (hex, at least four digits) for
 * values that fit in 16 bits and "\\U0010FFFF" (hex, at least eight
 * digits) for larger values.  The second form is carried by a nested
 * EscapeTransliterator that the returned object adopts.  The context
 * token is not consulted.
 */
Transliterator* EscapeTransliterator::_createC(const UnicodeString& ID, Token context) {
    // Only the prefix, suffix, radix and minDigits of the nested object are
    // read by handleTransliterate(); it is given an empty ID.
    EscapeTransliterator* supplementalForm =
        new EscapeTransliterator(EMPTY, BS_U, EMPTY, 16, 8, TRUE, NULL);

    return new EscapeTransliterator(ID, BS_u, EMPTY, 16, 4, TRUE, supplementalForm);
}
|
||||
/**
 * Factory for the hexadecimal XML notation, e.g. "&#x10FFFF;": hex with
 * at least one digit, terminated by a semicolon.  The context token is
 * not consulted.
 */
Transliterator* EscapeTransliterator::_createXML(const UnicodeString& ID, Token context) {
    return new EscapeTransliterator(ID,
                                    XMLPRE,  // prefix "&#x"
                                    SEMI,    // suffix ";"
                                    16,      // radix
                                    1,       // minimum digit count
                                    TRUE,    // read whole code points
                                    NULL);   // no separate supplemental form
}
|
||||
/**
 * Factory for the decimal XML notation, e.g. "&#1114111;": radix 10 with
 * at least one digit, terminated by a semicolon.  (Registered under an
 * "Any-Hex" ID even though the output is decimal.)  The context token is
 * not consulted.
 */
Transliterator* EscapeTransliterator::_createXML10(const UnicodeString& ID, Token context) {
    return new EscapeTransliterator(ID,
                                    XML10PRE,  // prefix "&#"
                                    SEMI,      // suffix ";"
                                    10,        // radix
                                    1,         // minimum digit count
                                    TRUE,      // read whole code points
                                    NULL);     // no separate supplemental form
}
|
||||
/**
 * Factory for the Perl notation, e.g. "\\x{263A}": hex with at least one
 * digit, wrapped in braces.  The context token is not consulted.
 */
Transliterator* EscapeTransliterator::_createPerl(const UnicodeString& ID, Token context) {
    return new EscapeTransliterator(ID,
                                    PERLPRE,  // prefix "\\x{"
                                    RBRACE,   // suffix "}"
                                    16,       // radix
                                    1,        // minimum digit count
                                    TRUE,     // read whole code points
                                    NULL);    // no separate supplemental form
}
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
void EscapeTransliterator::registerIDs() {
|
||||
Token t = integerToken(0);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/Unicode", _createUnicode, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/Java", _createJava, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/C", _createC, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/XML", _createXML, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/XML10", _createXML10, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex/Perl", _createPerl, t);
|
||||
|
||||
Transliterator::_registerFactory("Any-Hex", _createJava, t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an escape transliterator with the given ID and
|
||||
* parameters. See the class member documentation for details.
|
||||
*/
|
||||
EscapeTransliterator::EscapeTransliterator(const UnicodeString& ID,
                                           const UnicodeString& prefix,
                                           const UnicodeString& suffix,
                                           int32_t radix, int32_t minDigits,
                                           UBool grokSupplementals,
                                           EscapeTransliterator* adoptedSupplementalHandler) :
    Transliterator(ID, NULL),
    // In each initializer below the name outside the parentheses is the
    // member and the name inside is the constructor parameter.
    prefix(prefix),
    suffix(suffix),
    radix(radix),
    minDigits(minDigits),
    grokSupplementals(grokSupplementals),
    // Ownership transfers to this object; released in the destructor.
    supplementalHandler(adoptedSupplementalHandler)
{
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
EscapeTransliterator::EscapeTransliterator(const EscapeTransliterator& o) :
    Transliterator(o),
    prefix(o.prefix),
    suffix(o.suffix),
    radix(o.radix),
    minDigits(o.minDigits),
    grokSupplementals(o.grokSupplementals),
    // Deep-copy the owned handler so that each object deletes its own.
    supplementalHandler((o.supplementalHandler != 0)
                            ? new EscapeTransliterator(*o.supplementalHandler)
                            : NULL)
{
}
|
||||
|
||||
/**
 * Destructor.  Releases the adopted supplemental handler, if any.
 */
EscapeTransliterator::~EscapeTransliterator() {
    // Deleting a null pointer is a no-op, so no guard is needed.
    delete supplementalHandler;
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* EscapeTransliterator::clone() const {
    // The copy constructor duplicates the owned supplemental handler.
    EscapeTransliterator* duplicate = new EscapeTransliterator(*this);
    return duplicate;
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void EscapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                               UBool isIncremental) const {
    // Every character in [pos.start, pos.limit) is replaced by its escape.
    // isIncremental is not consulted.
    int32_t cursor = pos.start;
    int32_t end = pos.limit;

    // The work buffer normally starts with this object's own prefix so that
    // only the digits and suffix are rewritten per character.  The flag
    // records whether that is still true after a supplemental escape has
    // overwritten the buffer.
    const int32_t ownPrefixLen = prefix.length();
    UnicodeString escaped(prefix);
    UBool holdsOwnPrefix = TRUE;

    while (cursor < end) {
        // Read a whole code point or a single 16-bit unit, as configured.
        int32_t codePoint;
        int32_t unitCount;
        if (grokSupplementals) {
            codePoint = text.char32At(cursor);
            unitCount = UTF_CHAR_LENGTH(codePoint);
        } else {
            codePoint = text.charAt(cursor);
            unitCount = 1;
        }

        if (supplementalHandler != NULL && (codePoint & 0xFFFF0000) != 0) {
            // Value does not fit in 16 bits and a dedicated form exists:
            // build the escape from the handler's parameters.
            escaped.truncate(0);
            escaped.append(supplementalHandler->prefix);
            Utility::appendNumber(escaped, codePoint,
                                  supplementalHandler->radix,
                                  supplementalHandler->minDigits);
            escaped.append(supplementalHandler->suffix);
            holdsOwnPrefix = FALSE;
        } else {
            if (holdsOwnPrefix) {
                // Keep the prefix, discard the previous digits and suffix.
                escaped.truncate(ownPrefixLen);
            } else {
                escaped.truncate(0);
                escaped.append(prefix);
                holdsOwnPrefix = TRUE;
            }
            Utility::appendNumber(escaped, codePoint, radix, minDigits);
            escaped.append(suffix);
        }

        const int32_t escapedLen = escaped.length();
        text.handleReplaceBetween(cursor, cursor + unitCount, escaped);

        // Step over the text just written and account for the length change.
        cursor += escapedLen;
        end += escapedLen - unitCount;
    }

    // Publish the new positions; contextLimit moves by the same delta as limit.
    pos.contextLimit += end - pos.limit;
    pos.limit = end;
    pos.start = cursor;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
//eof
|
138
icu4c/source/i18n/esctrn.h
Normal file
138
icu4c/source/i18n/esctrn.h
Normal file
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/20/2001 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef ESCTRN_H
|
||||
#define ESCTRN_H
|
||||
|
||||
#include "unicode/translit.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* A transliterator that converts Unicode characters to an escape
|
||||
* form.  Examples of escape forms are "U+4E01" and "&#x10FFFF;".
|
||||
* Escape forms have a prefix and suffix, either of which may be
|
||||
* empty, a radix, typically 16 or 10, a minimum digit count,
|
||||
* typically 1, 4, or 8, and a boolean that specifies whether
|
||||
* supplemental characters are handled as 32-bit code points or as two
|
||||
* 16-bit code units. Most escape forms handle 32-bit code points,
|
||||
* but some, such as the Java form, intentionally break them into two
|
||||
* surrogate code units, for backward compatibility.
|
||||
*
|
||||
* <p>Some escape forms actually have two different patterns, one for
|
||||
* BMP characters (0..FFFF) and one for supplements (>FFFF). To
|
||||
* handle this, a second EscapeTransliterator may be defined that
|
||||
* specifies the pattern to be produced for supplementals. An example
|
||||
* of a form that requires this is the C form, which uses "\\uFFFF"
|
||||
* for BMP characters and "\\U0010FFFF" for supplementals.
|
||||
*
|
||||
* <p>This class is package private. It registers several standard
|
||||
* variants with the system which are then accessed via their IDs.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: esctrn.h,v $ $Revision: 1.1 $ $Date: 2001/11/21 07:02:14 $
|
||||
*/
|
||||
class U_I18N_API EscapeTransliterator : public Transliterator {

 private:

    /**
     * Text written before the digits of each escape.  May be empty, but
     * usually is not.
     */
    UnicodeString prefix;

    /**
     * Text written after the digits of each escape.  Often empty.
     */
    UnicodeString suffix;

    /**
     * Radix in which the character value is written.  Typically 16 or
     * 10; must lie in the range 2 to 36.
     */
    int32_t radix;

    /**
     * Minimum number of digits written.  Typically 1, 4, or 8.  Values
     * below 1 are treated as 1.
     */
    int32_t minDigits;

    /**
     * When true, the input is read as 32-bit code points.  When false,
     * it is read as individual 16-bit code units, so a supplementary
     * character yields two escapes.
     */
    UBool grokSupplementals;

    /**
     * Optional alternate form for values above 0xFFFF.  When null, the
     * same form is used for every character.  When non-null and
     * grokSupplementals is true, the prefix, suffix, radix, and
     * minDigits of this object are used for such values.  Owned by
     * this object.
     */
    EscapeTransliterator* supplementalHandler;

 public:

    /**
     * Registers the standard "Any-Hex" variants with the system.
     * Called by Transliterator during initialization.
     */
    static void registerIDs();

    /**
     * Constructs an escape transliterator with the given ID and
     * parameters; each parameter initializes the data member of the
     * same name.  adoptedSupplementalHandler may be null; if it is not,
     * this object takes ownership of it.
     */
    EscapeTransliterator(const UnicodeString& ID,
                         const UnicodeString& prefix, const UnicodeString& suffix,
                         int32_t radix, int32_t minDigits,
                         UBool grokSupplementals,
                         EscapeTransliterator* adoptedSupplementalHandler);

    /**
     * Copy constructor.  The supplemental handler, if any, is cloned.
     */
    EscapeTransliterator(const EscapeTransliterator&);

    /**
     * Destructor.  Deletes the supplemental handler.
     */
    virtual ~EscapeTransliterator();

    /**
     * Transliterator API.  Returns a newly allocated copy of this object.
     */
    virtual Transliterator* clone() const;

 protected:

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     */
    void handleTransliterate(Replaceable& text, UTransPosition& offset,
                             UBool isIncremental) const;

 private:

    /**
     * Factory methods, one per registered variant.
     */
    static Transliterator* _createUnicode(const UnicodeString& ID, Token context);
    static Transliterator* _createJava(const UnicodeString& ID, Token context);
    static Transliterator* _createC(const UnicodeString& ID, Token context);
    static Transliterator* _createXML(const UnicodeString& ID, Token context);
    static Transliterator* _createXML10(const UnicodeString& ID, Token context);
    static Transliterator* _createPerl(const UnicodeString& ID, Token context);
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
|
@ -150,6 +150,10 @@ SOURCE=.\dtfmtsym.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\esctrn.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\fmtable.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -342,6 +346,10 @@ SOURCE=.\umsg.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unesctrn.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\uni2name.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
#include "transreg.h"
|
||||
#include "ucln_in.h"
|
||||
#include "unicode/cpdtrans.h"
|
||||
#include "unicode/hextouni.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/msgfmt.h"
|
||||
#include "name2uni.h"
|
||||
|
@ -35,8 +34,9 @@
|
|||
#include "unicode/unifilt.h"
|
||||
#include "unicode/unifltlg.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unitohex.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "esctrn.h"
|
||||
#include "unesctrn.h"
|
||||
|
||||
|
||||
// keep in sync with CompoundTransliterator
|
||||
|
@ -1690,8 +1690,6 @@ void Transliterator::initializeRegistry(void) {
|
|||
// cache. This is how new non-rule-based transliterators are
|
||||
// added to the system.
|
||||
|
||||
registry->put(new HexToUnicodeTransliterator(), TRUE);
|
||||
registry->put(new UnicodeToHexTransliterator(), TRUE);
|
||||
registry->put(new NullTransliterator(), TRUE);
|
||||
registry->put(new RemoveTransliterator(), TRUE);
|
||||
registry->put(new LowercaseTransliterator(), TRUE);
|
||||
|
@ -1701,6 +1699,8 @@ void Transliterator::initializeRegistry(void) {
|
|||
_registerSpecialInverse("Title", "Lower", FALSE);
|
||||
registry->put(new UnicodeNameTransliterator(), TRUE);
|
||||
registry->put(new NameUnicodeTransliterator(), TRUE);
|
||||
EscapeTransliterator::registerIDs();
|
||||
UnescapeTransliterator::registerIDs();
|
||||
NormalizationTransliterator::registerIDs();
|
||||
ucln_i18n_registerCleanup();
|
||||
}
|
||||
|
|
278
icu4c/source/i18n/unesctrn.cpp
Normal file
278
icu4c/source/i18n/unesctrn.cpp
Normal file
|
@ -0,0 +1,278 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/19/2001 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unesctrn.h"
|
||||
#include "util.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Special character marking the end of the spec[] array.
|
||||
*/
|
||||
static const UChar END = 0xFFFF;
|
||||
|
||||
// Unicode: "U+10FFFF" hex, min=4, max=6
|
||||
static const UChar SPEC_Unicode[] = {
|
||||
2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
|
||||
END
|
||||
};
|
||||
|
||||
// Java: "\\uFFFF" hex, min=4, max=4
|
||||
static const UChar SPEC_Java[] = {
|
||||
2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
|
||||
END
|
||||
};
|
||||
|
||||
// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
|
||||
static const UChar SPEC_C[] = {
|
||||
2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
|
||||
2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
|
||||
END
|
||||
};
|
||||
|
||||
// XML: "&#x10FFFF;" hex, min=1, max=6
|
||||
static const UChar SPEC_XML[] = {
|
||||
3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
|
||||
END
|
||||
};
|
||||
|
||||
// XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
|
||||
static const UChar SPEC_XML10[] = {
|
||||
2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
|
||||
END
|
||||
};
|
||||
|
||||
// Perl: "\\x{263A}" hex, min=1, max=6
|
||||
static const UChar SPEC_Perl[] = {
|
||||
3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
|
||||
END
|
||||
};
|
||||
|
||||
// All forms, tried in this order: Unicode, Java, C (\U form), XML, XML10, Perl
|
||||
static const UChar SPEC_Any[] = {
|
||||
2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
|
||||
2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
|
||||
2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
|
||||
3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
|
||||
2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
|
||||
3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
|
||||
END
|
||||
};
|
||||
|
||||
/**
|
||||
* Factory methods
|
||||
*/
|
||||
/**
 * Factory for the unescaper that recognizes only the "U+" form
 * (SPEC_Unicode).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createUnicode(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_Unicode);
}
|
||||
/**
 * Factory for the unescaper that recognizes only the Java "\\u" form
 * (SPEC_Java).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createJava(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_Java);
}
|
||||
/**
 * Factory for the unescaper that recognizes the C "\\u" and "\\U" forms
 * (SPEC_C).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createC(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_C);
}
|
||||
/**
 * Factory for the unescaper that recognizes only the hexadecimal XML
 * form (SPEC_XML).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createXML(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_XML);
}
|
||||
/**
 * Factory for the unescaper that recognizes only the decimal XML form
 * (SPEC_XML10).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createXML10(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_XML10);
}
|
||||
/**
 * Factory for the unescaper that recognizes only the Perl "\\x{...}"
 * form (SPEC_Perl).  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createPerl(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_Perl);
}
|
||||
/**
 * Factory for the unescaper that recognizes every form listed in
 * SPEC_Any.  The context token is not consulted.
 */
Transliterator* UnescapeTransliterator::_createAny(const UnicodeString& ID, Token context) {
    // The constructor copies the table; it does not adopt it.
    return new UnescapeTransliterator(ID, SPEC_Any);
}
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
void UnescapeTransliterator::registerIDs() {
|
||||
Token t = integerToken(0);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/Unicode", _createUnicode, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/Java", _createJava, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/C", _createC, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/XML", _createXML, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/XML10", _createXML10, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any/Perl", _createPerl, t);
|
||||
|
||||
Transliterator::_registerFactory("Hex-Any", _createAny, t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor. Takes the encoded spec array.
|
||||
*/
|
||||
UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& ID,
                                               const UChar *spec) :
    Transliterator(ID, NULL),
    // Member 'spec' receives a private copy of the END-terminated table
    // named by parameter 'spec'; the caller keeps ownership of its array.
    spec(copySpec(spec))
{
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    Transliterator(o),
    // Each object owns its own table, so duplicate rather than share.
    spec(copySpec(o.spec))
{
}
|
||||
|
||||
/**
 * Destructor.  Releases the owned copy of the spec table.
 */
UnescapeTransliterator::~UnescapeTransliterator() {
    // copySpec() allocates the table with new UChar[...], so it must be
    // released with the array form of delete; scalar delete on an array
    // allocation is undefined behavior.
    delete[] spec;
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* UnescapeTransliterator::clone() const {
    // The copy constructor gives the new object its own spec table.
    UnescapeTransliterator* duplicate = new UnescapeTransliterator(*this);
    return duplicate;
}
|
||||
|
||||
/**
 * Returns a newly allocated copy of an END-terminated spec table,
 * including the END marker itself.  The result is allocated with
 * new[] and is owned by the caller.
 */
UChar* UnescapeTransliterator::copySpec(const UChar* spec) {
    // Locate the END marker.
    const UChar* terminator = spec;
    while (*terminator != END) {
        ++terminator;
    }

    // The copy includes the marker, hence the +1.
    const int32_t count = (int32_t)(terminator - spec) + 1;

    UChar* copy = new UChar[count];
    uprv_memcpy(copy, spec, count * sizeof(copy[0]));
    return copy;
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                                 UBool isIncremental) const {
    int32_t cursor = pos.start;
    int32_t end = pos.limit;
    UnicodeString replacement;

    while (cursor < end) {
        // Try each form in spec[] at the current position.  The form loop
        // is left with 'break' as soon as one form matches.  If a partial
        // match runs into 'end' while isIncremental is true, the whole
        // function stops via 'goto done' with 'cursor' still at the start
        // of the partial escape.
        int32_t specIndex = 0;
        while (spec[specIndex] != END) {

            // Five-element header of this form.
            const int32_t prefixLen = spec[specIndex++];
            const int32_t suffixLen = spec[specIndex++];
            const int8_t  radix     = (int8_t) spec[specIndex++];
            const int32_t minDigits = spec[specIndex++];
            const int32_t maxDigits = spec[specIndex++];

            // Prefix characters follow the header; suffix characters follow
            // the prefix.
            const UChar* prefixChars = spec + specIndex;
            const UChar* suffixChars = prefixChars + prefixLen;

            // 'scan' runs ahead of 'cursor' over the candidate escape.
            int32_t scan = cursor;
            UBool matched = TRUE;
            int32_t k;

            // --- prefix ---
            for (k = 0; k < prefixLen; ++k) {
                if (scan >= end && k > 0) {
                    // Ran out of text after matching at least one prefix
                    // character: a partial match.  Wait for more input in
                    // incremental mode; otherwise try the next form.
                    if (isIncremental) {
                        goto done;
                    }
                    matched = FALSE;
                    break;
                }
                if (text.charAt(scan++) != prefixChars[k]) {
                    matched = FALSE;
                    break;
                }
            }

            if (matched) {
                // --- digits ---
                UChar32 value = 0;
                int32_t digitsSeen = 0;
                for (;;) {
                    if (scan >= end) {
                        // Partial match in incremental mode: wait for more.
                        if (scan > cursor && isIncremental) {
                            goto done;
                        }
                        break;
                    }
                    const UChar32 ch = text.char32At(scan);
                    const int32_t digit = u_digit(ch, radix);
                    if (digit < 0) {
                        break;
                    }
                    scan += UTF_CHAR_LENGTH(ch);
                    value = (value * radix) + digit;
                    if (++digitsSeen == maxDigits) {
                        break;
                    }
                }

                matched = (digitsSeen >= minDigits);

                if (matched) {
                    // --- suffix ---
                    for (k = 0; k < suffixLen; ++k) {
                        if (scan >= end) {
                            // Partial match in incremental mode: wait for more.
                            if (scan > cursor && isIncremental) {
                                goto done;
                            }
                            matched = FALSE;
                            break;
                        }
                        if (text.charAt(scan++) != suffixChars[k]) {
                            matched = FALSE;
                            break;
                        }
                    }

                    if (matched) {
                        // Complete escape: replace [cursor, scan) with the
                        // character it denotes and shrink 'end' by the
                        // number of code units removed.
                        replacement.truncate(0);
                        replacement.append(value);
                        text.handleReplaceBetween(cursor, scan, replacement);
                        end -= scan - cursor - replacement.length();
                        // Leave the form loop; the outer loop then steps
                        // past the character just written.
                        break;
                    }
                }
            }

            // No match for this form: skip its prefix and suffix characters
            // to reach the next header.
            specIndex += prefixLen + suffixLen;
        }

        // Step over one character: either the replacement just written or
        // an input character that starts no recognized escape.
        if (cursor < end) {
            cursor += UTF_CHAR_LENGTH(text.char32At(cursor));
        }
    }

 done:
    // Publish the new positions; contextLimit moves by the same delta as limit.
    pos.contextLimit += end - pos.limit;
    pos.limit = end;
    pos.start = cursor;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
//eof
|
101
icu4c/source/i18n/unesctrn.h
Normal file
101
icu4c/source/i18n/unesctrn.h
Normal file
|
@ -0,0 +1,101 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/20/2001 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNESCTRN_H
|
||||
#define UNESCTRN_H
|
||||
|
||||
#include "unicode/translit.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* A transliterator that converts Unicode escape forms to the
|
||||
* characters they represent. Escape forms have a prefix, a suffix, a
|
||||
* radix, and minimum and maximum digit counts.
|
||||
*
|
||||
* <p>This class is package private. It registers several standard
|
||||
* variants with the system which are then accessed via their IDs.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: unesctrn.h,v $ $Revision: 1.1 $ $Date: 2001/11/21 07:02:15 $
|
||||
*/
|
||||
class U_I18N_API UnescapeTransliterator : public Transliterator {

 private:

    /**
     * The encoded pattern specification: zero or more forms followed
     * by a single END character.
     *
     * Each form begins with a five-element header holding, in order,
     * the prefix length, suffix length, radix, minimum digit count, and
     * maximum digit count, each stored as a 16-bit value.  The header
     * is followed by the prefix characters and then the suffix
     * characters, so a form occupies n+5 elements, where n is the
     * combined length of prefix and suffix.
     */
    UChar* spec; // owned; may not be NULL

 public:

    /**
     * Registers the standard "Hex-Any" variants with the system.
     * Called by Transliterator during initialization.
     */
    static void registerIDs();

    /**
     * Constructor.  Takes the encoded spec array and copies it; the
     * array is not adopted.
     */
    UnescapeTransliterator(const UnicodeString& ID,
                           const UChar *spec);

    /**
     * Copy constructor.  The new object gets its own copy of the spec.
     */
    UnescapeTransliterator(const UnescapeTransliterator&);

    /**
     * Destructor.  Releases the owned spec.
     */
    virtual ~UnescapeTransliterator();

    /**
     * Transliterator API.  Returns a newly allocated copy of this object.
     */
    virtual Transliterator* clone() const;

 protected:

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     */
    void handleTransliterate(Replaceable& text, UTransPosition& offset,
                             UBool isIncremental) const;

 private:

    /**
     * Factory methods, one per registered variant.
     */
    static Transliterator* _createUnicode(const UnicodeString& ID, Token context);
    static Transliterator* _createJava(const UnicodeString& ID, Token context);
    static Transliterator* _createC(const UnicodeString& ID, Token context);
    static Transliterator* _createXML(const UnicodeString& ID, Token context);
    static Transliterator* _createXML10(const UnicodeString& ID, Token context);
    static Transliterator* _createPerl(const UnicodeString& ID, Token context);
    static Transliterator* _createAny(const UnicodeString& ID, Token context);

    /**
     * Returns a newly allocated copy of an END-terminated spec array.
     */
    static UChar* copySpec(const UChar* spec);
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue