mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-989 implement algorithmic Any-Name, Name-Any
X-SVN-Rev: 4948
This commit is contained in:
parent
cc3428af39
commit
8f8fbedbfa
4 changed files with 433 additions and 0 deletions
183
icu4c/source/i18n/name2uni.cpp
Normal file
183
icu4c/source/i18n/name2uni.cpp
Normal file
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/07/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/name2uni.h"
|
||||
#include "unicode/unifilt.h"
|
||||
#include "unicode/unicode.h"
|
||||
|
||||
// ID string that the constructors in this file pass to the Transliterator base class.
const char* NameUnicodeTransliterator::_ID = "Name-Any";
|
||||
|
||||
// As of Unicode 3.0.0, the longest name is 83 characters long.
|
||||
// handleTransliterate() sizes its name buffer as LONGEST_NAME + 8 and stops
// collecting a name once LONGEST_NAME + 4 characters have been buffered.
#define LONGEST_NAME 83
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
NameUnicodeTransliterator::NameUnicodeTransliterator(
|
||||
UChar32 openDelim, UChar32 closeDelim,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
openDelimiter(openDelim),
|
||||
closeDelimiter(closeDelim) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default delimiters '{' and
|
||||
* '}'.
|
||||
*/
|
||||
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
openDelimiter((UChar) 0x007B /*{*/),
|
||||
closeDelimiter((UChar) 0x007D /*}*/) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
|
||||
Transliterator(o),
|
||||
openDelimiter(o.openDelimiter),
|
||||
closeDelimiter(o.closeDelimiter) {}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
|
||||
const NameUnicodeTransliterator& o) {
|
||||
Transliterator::operator=(o);
|
||||
openDelimiter = o.openDelimiter;
|
||||
closeDelimiter = o.closeDelimiter;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* NameUnicodeTransliterator::clone(void) const {
|
||||
return new NameUnicodeTransliterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
||||
UBool isIncremental) const {
|
||||
// Accomodate the longest possible name plus padding
|
||||
char buf[LONGEST_NAME + 8];
|
||||
|
||||
// The only characters used in names are (as of Unicode 3.0.0):
|
||||
// -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
// (first character is a space).
|
||||
|
||||
int32_t cursor = offsets.start;
|
||||
int32_t limit = offsets.limit;
|
||||
|
||||
// Modes:
|
||||
// 0 - looking for open delimiter
|
||||
// 1 - after open delimiter
|
||||
int32_t mode = 0;
|
||||
int32_t ibuf = 0;
|
||||
int32_t openPos = offsets.start; // position of openDelimiter
|
||||
|
||||
UnicodeString str;
|
||||
|
||||
for (; cursor < limit; ++cursor) {
|
||||
UChar c = filteredCharAt(text, cursor);
|
||||
|
||||
switch (mode) {
|
||||
case 0: // looking for open delimiter
|
||||
if (c == openDelimiter) {
|
||||
openPos = cursor;
|
||||
mode = 1;
|
||||
ibuf = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 1: // after open delimiter
|
||||
// Look for [-a-zA-Z0-9]. If \w+ is found, convert it
|
||||
// to a single space. If closeDelimiter is found, exit
|
||||
// the loop. If any other character is found, exit the
|
||||
// loop. If the limit is found, exit the loop.
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore leading whitespace
|
||||
if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
|
||||
buf[ibuf++] = (UChar)0x0020 /* */;
|
||||
// If we go a bit past the longest possible name then abort
|
||||
if (ibuf == (LONGEST_NAME + 4)) {
|
||||
mode = 0;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == closeDelimiter) {
|
||||
// Delete trailing space, if any
|
||||
if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
|
||||
--ibuf;
|
||||
}
|
||||
buf[ibuf] = 0; // Add terminating zero
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UChar32 ch = u_charFromName(U_UNICODE_CHAR_NAME, buf, &status);
|
||||
if (ch != (UChar32) 0xFFFF && U_SUCCESS(status)) {
|
||||
// Lookup succeeded
|
||||
str.truncate(0);
|
||||
str.append(ch);
|
||||
text.handleReplaceBetween(openPos, cursor+1, str);
|
||||
|
||||
// Adjust indices for the change in the length of
|
||||
// the string. Do not assume that str.length() ==
|
||||
// 1, in case of surrogates.
|
||||
int32_t delta = cursor + 1 - openPos - str.length();
|
||||
cursor -= delta;
|
||||
limit -= delta;
|
||||
// assert(cursor == openPos + str.length());
|
||||
}
|
||||
// If the lookup failed, we leave things as-is and
|
||||
// still switch to mode 0 and continue.
|
||||
mode = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c >= (UChar)0x0061 && c <= (UChar)0x007A) {
|
||||
c -= 0x0020; // [a-z] => [A-Z]
|
||||
}
|
||||
|
||||
// Check if c =~ [-A-Z0-9]
|
||||
if (c == (UChar)0x002D ||
|
||||
(c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
|
||||
(c >= (UChar)0x0030 && c <= (UChar)0x0039)) {
|
||||
buf[ibuf++] = (char) c;
|
||||
// If we go a bit past the longest possible name then abort
|
||||
if (ibuf == (LONGEST_NAME + 4)) {
|
||||
mode = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Invalid character
|
||||
else {
|
||||
--cursor; // Backup and reprocess this character
|
||||
mode = 0;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
offsets.contextLimit += limit - offsets.limit;
|
||||
offsets.limit = limit;
|
||||
// In incremental mode, only advance the cursor up to the last
|
||||
// open delimiter, if we are in mode 1.
|
||||
offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
|
||||
}
|
108
icu4c/source/i18n/uni2name.cpp
Normal file
108
icu4c/source/i18n/uni2name.cpp
Normal file
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/uni2name.h"
|
||||
#include "unicode/unifilt.h"
|
||||
|
||||
// ID string that the constructors in this file pass to the Transliterator base class.
const char* UnicodeNameTransliterator::_ID = "Any-Name";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
UnicodeNameTransliterator::UnicodeNameTransliterator(
|
||||
UChar32 openDelim, UChar32 closeDelim,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
openDelimiter(openDelim),
|
||||
closeDelimiter(closeDelim) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default delimiters '{' and
|
||||
* '}'.
|
||||
*/
|
||||
UnicodeNameTransliterator::UnicodeNameTransliterator(UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
openDelimiter((UChar) 0x007B /*{*/),
|
||||
closeDelimiter((UChar) 0x007D /*}*/) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
UnicodeNameTransliterator::~UnicodeNameTransliterator() {}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliterator& o) :
|
||||
Transliterator(o),
|
||||
openDelimiter(o.openDelimiter),
|
||||
closeDelimiter(o.closeDelimiter) {}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
UnicodeNameTransliterator& UnicodeNameTransliterator::operator=(
|
||||
const UnicodeNameTransliterator& o) {
|
||||
Transliterator::operator=(o);
|
||||
openDelimiter = o.openDelimiter;
|
||||
closeDelimiter = o.closeDelimiter;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* UnicodeNameTransliterator::clone(void) const {
|
||||
return new UnicodeNameTransliterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
||||
UBool isIncremental) const {
|
||||
// As of Unicode 3.0.0, the longest name is 83 characters long.
|
||||
// Adjust this buffer size as needed.
|
||||
char buf[128];
|
||||
|
||||
int32_t cursor = offsets.start;
|
||||
int32_t limit = offsets.limit;
|
||||
|
||||
const UnicodeFilter* filt = getFilter();
|
||||
UnicodeString str(openDelimiter);
|
||||
UErrorCode status;
|
||||
UTextOffset len;
|
||||
|
||||
while (cursor < limit) {
|
||||
status = U_ZERO_ERROR;
|
||||
UChar c = text.charAt(cursor);
|
||||
if ((filt == 0 || filt->contains(c)) &&
|
||||
(len=u_charName(c, U_UNICODE_CHAR_NAME, buf, sizeof(buf), &status)) > 0 &&
|
||||
!U_FAILURE(status)) {
|
||||
|
||||
str.truncate(1);
|
||||
str.append(UnicodeString(buf, len, "")).append(closeDelimiter);
|
||||
|
||||
text.handleReplaceBetween(cursor, cursor+1, str);
|
||||
len += 2; // adjust for delimiters
|
||||
cursor += len; // advance cursor by 1 and adjust for new text
|
||||
limit += len-1; // change in length is (len - 1)
|
||||
} else {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
offsets.contextLimit += limit - offsets.limit;
|
||||
offsets.limit = limit;
|
||||
offsets.start = cursor;
|
||||
}
|
71
icu4c/source/i18n/unicode/name2uni.h
Normal file
71
icu4c/source/i18n/unicode/name2uni.h
Normal file
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/07/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef NAME2UNI_H
|
||||
#define NAME2UNI_H
|
||||
|
||||
#include "unicode/translit.h"
|
||||
|
||||
/**
|
||||
* A transliterator that performs name to character mapping.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API NameUnicodeTransliterator : public Transliterator {
|
||||
|
||||
UChar32 openDelimiter;
|
||||
UChar32 closeDelimiter;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
NameUnicodeTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
|
||||
UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default delimiters '{' and
|
||||
* '}'.
|
||||
*/
|
||||
NameUnicodeTransliterator(UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~NameUnicodeTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
NameUnicodeTransliterator(const NameUnicodeTransliterator&);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
NameUnicodeTransliterator& operator=(const NameUnicodeTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* clone(void) const;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
* @draft
|
||||
*/
|
||||
virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
|
||||
UBool isIncremental) const;
|
||||
|
||||
private:
|
||||
|
||||
static const char* _ID;
|
||||
};
|
||||
#endif
|
71
icu4c/source/i18n/unicode/uni2name.h
Normal file
71
icu4c/source/i18n/unicode/uni2name.h
Normal file
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNI2NAME_H
|
||||
#define UNI2NAME_H
|
||||
|
||||
#include "unicode/translit.h"
|
||||
|
||||
/**
|
||||
* A transliterator that performs character to name mapping.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API UnicodeNameTransliterator : public Transliterator {
|
||||
|
||||
UChar32 openDelimiter;
|
||||
UChar32 closeDelimiter;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
UnicodeNameTransliterator(UChar32 openDelimiter, UChar32 closeDelimiter,
|
||||
UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default delimiters '{' and
|
||||
* '}'.
|
||||
*/
|
||||
UnicodeNameTransliterator(UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~UnicodeNameTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
UnicodeNameTransliterator(const UnicodeNameTransliterator&);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
UnicodeNameTransliterator& operator=(const UnicodeNameTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* clone(void) const;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
* @draft
|
||||
*/
|
||||
virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
|
||||
UBool isIncremental) const;
|
||||
|
||||
private:
|
||||
|
||||
static const char* _ID;
|
||||
};
|
||||
#endif
|
Loading…
Add table
Reference in a new issue