mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-15 01:42:37 +00:00
ICU-1007 remove ComposedCharIter
X-SVN-Rev: 5600
This commit is contained in:
parent
6a7682e8a0
commit
44850e6120
6 changed files with 2 additions and 299 deletions
|
@ -70,7 +70,7 @@ LIBS = $(LIBICUDT) @LIBS@
|
|||
CPPFLAGS += @DATA_PACKAGING_CPPFLAGS@
|
||||
|
||||
OBJECTS = compdata.o dcmpdata.o normlzr.o unorm.o bidi.o ubidi.o \
|
||||
ubidiwrt.o ubidiln.o chariter.o compitr.o cwchar.o schriter.o uchriter.o \
|
||||
ubidiwrt.o ubidiln.o chariter.o cwchar.o schriter.o uchriter.o \
|
||||
cpputils.o digitlst.o filestrm.o ushape.o umemstrm.o locid.o locmap.o uloc.o \
|
||||
mutex.o umutex.o putil.o udata.o uresbund.o uresdata.o resbund.o \
|
||||
uchar.o ucmp8.o ucmp16.o ucmp32.o ucmpe32.o uvector.o uhash.o uhash_us.o \
|
||||
|
|
|
@ -108,10 +108,6 @@ SOURCE=.\compdata.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\compitr.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\convert.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -414,10 +410,6 @@ SOURCE=.\compdata.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\compitr.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\convert.h
|
||||
|
||||
!IF "$(CFG)" == "common - Win32 Release"
|
||||
|
|
|
@ -1,133 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "dcmpdata.h"
|
||||
|
||||
#include "compitr.h"
|
||||
|
||||
#include "unicode/normlzr.h"
|
||||
|
||||
/**
|
||||
* Construct a new <tt>ComposedCharIter</tt>. The iterator will return
|
||||
* all Unicode characters with canonical decompositions, including Korean
|
||||
* Hangul characters.
|
||||
*/
|
||||
ComposedCharIter::ComposedCharIter()
|
||||
: minDecomp(DecompData::MAX_COMPAT),
|
||||
hangul(FALSE),
|
||||
curChar(0),
|
||||
nextChar(ComposedCharIter::DONE)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
|
||||
* <p>
|
||||
* @param compat <tt>false</tt> for canonical decompositions only;
|
||||
* <tt>true</tt> for both canonical and compatibility
|
||||
* decompositions.
|
||||
*
|
||||
* @param options Optional decomposition features. Currently, the only
|
||||
* supported option is {@link Normalizer#IGNORE_HANGUL}, which
|
||||
* causes this <tt>ComposedCharIter</tt> not to iterate
|
||||
* over the Hangul characters and their corresponding
|
||||
* Jamo decompositions.
|
||||
*/
|
||||
ComposedCharIter::ComposedCharIter(UBool compat,
|
||||
int32_t options)
|
||||
: minDecomp(compat ? 0 : DecompData::MAX_COMPAT),
|
||||
hangul((options & Normalizer::IGNORE_HANGUL) == 0),
|
||||
curChar(0),
|
||||
nextChar(ComposedCharIter::DONE)
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether there any precomposed Unicode characters not yet returned
|
||||
* by {@link #next}.
|
||||
*/
|
||||
UBool ComposedCharIter::hasNext() const {
|
||||
if (nextChar == DONE) {
|
||||
((ComposedCharIter*)this)->findNextChar();
|
||||
}
|
||||
return nextChar != DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next precomposed Unicode character.
|
||||
* Repeated calls to <tt>next</tt> return all of the precomposed characters defined
|
||||
* by Unicode, in ascending order. After all precomposed characters have
|
||||
* been returned, {@link #hasNext} will return <tt>false</tt> and further calls
|
||||
* to <tt>next</tt> will return {@link #DONE}.
|
||||
*/
|
||||
UChar ComposedCharIter::next()
|
||||
{
|
||||
if (nextChar == DONE) {
|
||||
findNextChar();
|
||||
}
|
||||
curChar = nextChar;
|
||||
nextChar = DONE;
|
||||
return curChar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Unicode decomposition of the current character.
|
||||
* This method returns the decomposition of the precomposed character most
|
||||
* recently returned by {@link #next}. The resulting decomposition is
|
||||
* affected by the settings of the
|
||||
* {@link Normalizer#COMPATIBILITY COMPATIBILITY}
|
||||
* and {@link Normalizer#NO_HANGUL NO_HANGUL} options passed to the constructor.
|
||||
*/
|
||||
void ComposedCharIter::getDecomposition(UnicodeString& result) const
|
||||
{
|
||||
// We duplicate most of the implementation of Normalizer::decompose() here
|
||||
// for efficiency. One thing we don't duplicate is the recursive
|
||||
// decomposition code. If we detect a need to do recursive decomposition
|
||||
// (which happens for only 16 characters in Unicode 3.0) then we delegate to
|
||||
// Normalizer::decompose(). This gives us optimal performance without
|
||||
// having a complete copy of Normalizer::decompose() here, with its extra
|
||||
// baggage of recursion buffers, etc. - Liu
|
||||
|
||||
result.truncate(0);
|
||||
|
||||
uint16_t offset = ucmp16_getu(DecompData::offsets, curChar);
|
||||
uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
|
||||
if (index > minDecomp) {
|
||||
if ((offset & DecompData::DECOMP_RECURSE) != 0) {
|
||||
// Let Normalizer::decompose() handle recursive decomp
|
||||
UnicodeString temp(curChar);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Normalizer::decompose(temp, minDecomp > 0,
|
||||
hangul ? Normalizer::IGNORE_HANGUL : 0,
|
||||
result, status);
|
||||
} else {
|
||||
Normalizer::doAppend((const UChar*)DecompData::contents, index, result);
|
||||
}
|
||||
}
|
||||
else if (hangul && curChar >= Normalizer::HANGUL_BASE && curChar < Normalizer::HANGUL_LIMIT) {
|
||||
Normalizer::hangulToJamo(curChar, result, (uint16_t)minDecomp);
|
||||
}
|
||||
else {
|
||||
result += curChar;
|
||||
}
|
||||
}
|
||||
|
||||
void ComposedCharIter::findNextChar()
|
||||
{
|
||||
if (curChar != DONE) {
|
||||
UChar ch = curChar;
|
||||
while (++ch < 0xFFFF) {
|
||||
UChar offset = ucmp16_getu(DecompData::offsets, ch);
|
||||
if (offset > minDecomp
|
||||
|| (hangul && ch >= Normalizer::HANGUL_BASE && ch < Normalizer::HANGUL_LIMIT) ) {
|
||||
nextChar = ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,130 +0,0 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1996-2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef COMPITR_H
|
||||
#define COMPITR_H
|
||||
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
|
||||
/**
|
||||
* <tt>ComposedCharIter</tt> is an iterator class that returns all
|
||||
* of the precomposed characters defined in the Unicode standard, along
|
||||
* with their decomposed forms. This is often useful when building
|
||||
* data tables (<i>e.g.</i> collation tables) which need to treat composed
|
||||
* and decomposed characters equivalently.
|
||||
* <p>
|
||||
* For example, imagine that you have built a collation table with ordering
|
||||
* rules for the {@link Normalizer#DECOMP canonically decomposed} forms of all
|
||||
* characters used in a particular language. When you process input text using
|
||||
* this table, the text must first be decomposed so that it matches the form
|
||||
* used in the table. This can impose a performance penalty that may be
|
||||
* unacceptable in some situations.
|
||||
* <p>
|
||||
* You can avoid this problem by ensuring that the collation table contains
|
||||
* rules for both the decomposed <i>and</i> composed versions of each character.
|
||||
* To do so, use a <tt>ComposedCharIter</tt> to iterate through all of the
|
||||
* composed characters in Unicode. If the decomposition for that character
|
||||
* consists solely of characters that are listed in your ruleset, you can
|
||||
* add a new rule for the composed character that makes it equivalent to
|
||||
* its decomposition sequence.
|
||||
* <p>
|
||||
* Note that <tt>ComposedCharIter</tt> iterates over a <em>static</em> table
|
||||
* of the composed characters in Unicode. If you want to iterate over the
|
||||
* composed characters in a particular string, use {@link Normalizer} instead.
|
||||
* <p>
|
||||
* When constructing a <tt>ComposedCharIter</tt> there is one
|
||||
* optional feature that you can enable or disable:
|
||||
* <ul>
|
||||
* <li>{@link Normalizer#IGNORE_HANGUL} - Do not iterate over the Hangul
|
||||
* characters and their corresponding Jamo decompositions.
|
||||
* This option is off by default (<i>i.e.</i> Hangul processing is enabled)
|
||||
* since the Unicode standard specifies that Hangul to Jamo
|
||||
* is a canonical decomposition.
|
||||
* </ul>
|
||||
* <p>
|
||||
* <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
|
||||
* <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
|
||||
* It will be updated as later versions of Unicode are released.
|
||||
*/
|
||||
class U_COMMON_API ComposedCharIter
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Constant that indicates the iteration has completed.
|
||||
* {@link #next} returns this value when there are no more composed
|
||||
* characters over which to iterate.
|
||||
* This value is equal to <code>Normalizer::DONE</tt>.
|
||||
*/
|
||||
enum { DONE = 0xffff };
|
||||
|
||||
/**
|
||||
* Construct a new <tt>ComposedCharIter</tt>. The iterator will return
|
||||
* all Unicode characters with canonical decompositions, including Korean
|
||||
* Hangul characters.
|
||||
*/
|
||||
ComposedCharIter();
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
|
||||
* <p>
|
||||
* @param compat <tt>false</tt> for canonical decompositions only;
|
||||
* <tt>true</tt> for both canonical and compatibility
|
||||
* decompositions.
|
||||
*
|
||||
* @param options Optional decomposition features. Currently, the only
|
||||
* supported option is {@link Normalizer#IGNORE_HANGUL}, which
|
||||
* causes this <tt>ComposedCharIter</tt> not to iterate
|
||||
* over the Hangul characters and their corresponding
|
||||
* Jamo decompositions.
|
||||
*/
|
||||
ComposedCharIter(UBool compat, int32_t options);
|
||||
|
||||
/**
|
||||
* Determines whether there any precomposed Unicode characters not yet returned
|
||||
* by {@link #next}.
|
||||
*/
|
||||
UBool hasNext(void) const;
|
||||
|
||||
/**
|
||||
* Returns the next precomposed Unicode character.
|
||||
* Repeated calls to <tt>next</tt> return all of the precomposed characters defined
|
||||
* by Unicode, in ascending order. After all precomposed characters have
|
||||
* been returned, {@link #hasNext} will return <tt>false</tt> and further calls
|
||||
* to <tt>next</tt> will return {@link #DONE}.
|
||||
*/
|
||||
UChar next(void);
|
||||
|
||||
/**
|
||||
* Returns the Unicode decomposition of the current character.
|
||||
* This method returns the decomposition of the precomposed character most
|
||||
* recently returned by {@link #next}. The resulting decomposition is
|
||||
* affected by the settings of the options passed to the constructor.
|
||||
* {@link Normalizer#COMPATIBILITY COMPATIBILITY}
|
||||
* and {@link Normalizer#NO_HANGUL NO_HANGUL} options passed to the constructor.
|
||||
*/
|
||||
void getDecomposition(UnicodeString& result) const;
|
||||
|
||||
private:
|
||||
void findNextChar(void);
|
||||
|
||||
int32_t minDecomp;
|
||||
UBool hangul;
|
||||
|
||||
UChar curChar;
|
||||
UChar nextChar;
|
||||
};
|
||||
|
||||
#endif // COMPITR_H
|
||||
|
||||
|
||||
|
|
@ -5,7 +5,6 @@
|
|||
********************************************************************/
|
||||
|
||||
#include "tstnorm.h"
|
||||
#include "compitr.h"
|
||||
|
||||
#define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*array)))
|
||||
|
||||
|
@ -33,8 +32,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
|
|||
CASE(7,TestTibetan);
|
||||
CASE(8,TestCompositionExclusion);
|
||||
CASE(9,TestZeroIndex);
|
||||
CASE(10,TestComposedCharIter);
|
||||
CASE(11,TestVerisign);
|
||||
CASE(10,TestVerisign);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -359,29 +357,6 @@ void BasicNormalizerTest::TestZeroIndex(void) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test ComposedCharIter.
|
||||
*/
|
||||
void BasicNormalizerTest::TestComposedCharIter(void) {
|
||||
ComposedCharIter iter;
|
||||
UnicodeString decompose;
|
||||
UnicodeString temp;
|
||||
UnicodeString buffer;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
while (iter.hasNext()) {
|
||||
UChar c = iter.next();
|
||||
temp.remove(0);
|
||||
temp.append(c);
|
||||
iter.getDecomposition(decompose);
|
||||
Normalizer::decompose(temp, TRUE, 0, buffer, status);
|
||||
if (buffer != decompose) {
|
||||
errln((UnicodeString)"FAIL: " +
|
||||
hex(c) + " -> ComposedCharIter:" + hex(decompose) +
|
||||
" vs. Normalizer:" + hex(buffer));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a few specific cases that are failing for Verisign.
|
||||
*/
|
||||
|
|
|
@ -44,7 +44,6 @@ public:
|
|||
void TestTibetan(void);
|
||||
void TestCompositionExclusion(void);
|
||||
void TestZeroIndex(void);
|
||||
void TestComposedCharIter(void);
|
||||
void TestVerisign(void);
|
||||
|
||||
private:
|
||||
|
|
Loading…
Add table
Reference in a new issue