ICU-11449 collation formatVersion 5: reorder single scripts not groups, write new scripts data with reserved ranges

X-SVN-Rev: 36914
This commit is contained in:
Markus Scherer 2015-01-05 23:31:52 +00:00
parent b2c4740141
commit 12f91f5673
3 changed files with 248 additions and 144 deletions

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationbasedatabuilder.cpp
@ -85,7 +85,10 @@ CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
: CollationDataBuilder(errorCode),
numericPrimary(0x12000000),
firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
rootElements(errorCode) {
rootElements(errorCode),
scriptStartsLength(1) {
uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
}
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
@ -309,27 +312,43 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
}
void
CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
const UnicodeString &groupScripts,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
if(groupScripts.isEmpty()) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
// Records where the primary weights of one script or special reorder group begin.
// The upper 16 bits of p (after the rounding below) are stored in scriptStarts[],
// and the index of that entry is stored in scriptsIndex[] for the given code.
// script is either a script code below USCRIPT_CODE_LIMIT or one of the
// special reorder codes starting at UCOL_REORDER_CODE_FIRST.
//
// The primary weight must be the lowest possible for a two-byte prefix.
// It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
p >>= 8;
U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
p >>= 8;
// p now holds only the lead byte and the second byte.
// The lowest second byte is 4 for a compressible lead byte, otherwise 2.
uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
if((p & 0xff) == lowestP2) {
// The script really starts on a lead byte boundary. Round down to that.
p &= 0xff00;
}
if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
// Zzzz must not occur.
// It is the code used in the API to separate low and high scripts.
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
// Script starts should be added in ascending order, otherwise we would need to sort them.
if(script < UCOL_REORDER_CODE_FIRST) {
U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
} else {
// Special reorder codes (at most 16 of them) are indexed right after
// the script codes, starting at USCRIPT_CODE_LIMIT.
U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
}
if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
// Two scripts share a range (e.g., Hira & Kana).
scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
} else {
// New, higher start: append it and point this code at the new entry.
U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
scriptsIndex[script] = (uint16_t)scriptStartsLength;
scriptStarts[scriptStartsLength++] = (uint16_t)p;
}
if(script == USCRIPT_UNKNOWN) {
// The last script start is for unassigned code points
// (with high implicit primary weights).
// Add one more entry with the limit of this range,
// which is the start of the trailing-weights range.
U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
scriptStarts[scriptStartsLength++] =
(uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
}
// Note: We are mostly trusting the input data,
// rather than verifying that reordering groups do not intersect
// with their lead byte ranges nor their sets of scripts,
// and that all script codes are valid.
scripts.append((UChar)((firstByte << 8) | lastByte));
scripts.append((UChar)groupScripts.length());
scripts.append(groupScripts);
}
void
@ -337,8 +356,18 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
buildMappings(data, errorCode);
data.numericPrimary = numericPrimary;
data.compressibleBytes = compressibleBytes;
data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
data.scriptsLength = scripts.length();
int32_t numScripts = USCRIPT_CODE_LIMIT;
while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
// Move the 16 special groups (not all used)
// down for contiguous storage of the script and special-group indexes.
for(int32_t i = 0; i < 16; ++i) {
scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
}
data.numScripts = numScripts;
data.scriptsIndex = scriptsIndex;
data.scriptStarts = scriptStarts;
data.scriptStartsLength = scriptStartsLength;
buildFastLatinTable(data, errorCode);
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationbasedatabuilder.h
@ -18,6 +18,7 @@
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uscript.h"
#include "collation.h"
#include "collationdata.h"
#include "collationdatabuilder.h"
@ -66,9 +67,7 @@ public:
void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
void addRootElement(int64_t ce, UErrorCode &errorCode);
void addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
const UnicodeString &groupScripts,
UErrorCode &errorCode);
void addScriptStart(int32_t script, uint32_t p);
virtual void build(CollationData &data, UErrorCode &errorCode);
@ -86,7 +85,9 @@ private:
uint32_t lastHanPrimary;
int32_t hanStep;
UVector64 rootElements;
UnicodeString scripts;
uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16]; // need exactly this many
uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16]; // should be safely more than needed
int32_t scriptStartsLength;
};
U_NAMESPACE_END

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2014, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -25,6 +25,8 @@
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
#include "unicode/ucol.h"
#include "unicode/uscript.h"
#include "unicode/utf8.h"
#include "charstr.h"
#include "cmemory.h"
@ -46,8 +48,6 @@
#include "uparse.h"
#include "writesrc.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
#if UCONFIG_NO_COLLATION
extern "C" int
@ -83,7 +83,7 @@ static UDataInfo ucaDataInfo={
0,
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
{ 4, 1, 0, 0 }, // formatVersion
{ 5, 0, 0, 0 }, // formatVersion
{ 6, 3, 0, 0 } // dataVersion
};
@ -206,32 +206,151 @@ static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode
}
}
// Hardcoded mapping from script sample characters to script codes.
// Pro: Available without complete and updated UCD scripts data,
// easy to add non-script codes specific to collation.
// Con: Needs manual update for each new script or change in sample character.
static const struct {
const char *name;
int32_t code;
} specialReorderTokens[] = {
{ "TERMINATOR", -2 }, // -2 means "ignore"
{ "LEVEL-SEPARATOR", -2 },
{ "FIELD-SEPARATOR", -2 },
{ "COMPRESS", -3 },
// The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
{ "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
{ "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
{ "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits).
{ "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes.
UChar32 sampleChar;
int32_t script;
} sampleCharsToScripts[] = {
{ 0x00A0, UCOL_REORDER_CODE_SPACE },
{ 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
{ 0x263A, UCOL_REORDER_CODE_SYMBOL },
{ 0x20AC, UCOL_REORDER_CODE_CURRENCY },
{ 0x0034, UCOL_REORDER_CODE_DIGIT },
{ 0x004C, USCRIPT_LATIN },
{ 0x03A9, USCRIPT_GREEK },
{ 0x03E2, USCRIPT_COPTIC },
{ 0x042F, USCRIPT_CYRILLIC },
{ 0x2C00, USCRIPT_GLAGOLITIC },
{ 0x1036B, USCRIPT_OLD_PERMIC },
{ 0x10D3, USCRIPT_GEORGIAN },
{ 0x0531, USCRIPT_ARMENIAN },
{ 0x05D0, USCRIPT_HEBREW },
{ 0x10900, USCRIPT_PHOENICIAN },
{ 0x0800, USCRIPT_SAMARITAN },
{ 0x0628, USCRIPT_ARABIC },
{ 0x0710, USCRIPT_SYRIAC },
{ 0x0840, USCRIPT_MANDAIC },
{ 0x078C, USCRIPT_THAANA },
{ 0x07CA, USCRIPT_NKO },
{ 0x2D5E, USCRIPT_TIFINAGH },
{ 0x12A0, USCRIPT_ETHIOPIC },
{ 0x0905, USCRIPT_DEVANAGARI },
{ 0x0995, USCRIPT_BENGALI },
{ 0x0A15, USCRIPT_GURMUKHI },
{ 0x0A95, USCRIPT_GUJARATI },
{ 0x0B15, USCRIPT_ORIYA },
{ 0x0B95, USCRIPT_TAMIL },
{ 0x0C15, USCRIPT_TELUGU },
{ 0x0C95, USCRIPT_KANNADA },
{ 0x0D15, USCRIPT_MALAYALAM },
{ 0x0D85, USCRIPT_SINHALA },
{ 0xABC0, USCRIPT_MEITEI_MAYEK },
{ 0xA800, USCRIPT_SYLOTI_NAGRI },
{ 0xA882, USCRIPT_SAURASHTRA },
{ 0x11083, USCRIPT_KAITHI },
{ 0x11152, USCRIPT_MAHAJANI },
{ 0x11183, USCRIPT_SHARADA },
{ 0x11208, USCRIPT_KHOJKI },
{ 0x112BE, USCRIPT_KHUDAWADI },
{ 0x11315, USCRIPT_GRANTHA },
{ 0x11484, USCRIPT_TIRHUTA },
{ 0x1158E, USCRIPT_SIDDHAM },
{ 0x1160E, USCRIPT_MODI },
{ 0x11680, USCRIPT_TAKRI },
{ 0x1B83, USCRIPT_SUNDANESE },
{ 0x11005, USCRIPT_BRAHMI },
{ 0x10A00, USCRIPT_KHAROSHTHI },
{ 0x0E17, USCRIPT_THAI },
{ 0x0EA5, USCRIPT_LAO },
{ 0xAA80, USCRIPT_TAI_VIET },
{ 0x0F40, USCRIPT_TIBETAN },
{ 0x1C00, USCRIPT_LEPCHA },
{ 0xA840, USCRIPT_PHAGS_PA },
{ 0x1900, USCRIPT_LIMBU },
{ 0x1703, USCRIPT_TAGALOG },
{ 0x1723, USCRIPT_HANUNOO },
{ 0x1743, USCRIPT_BUHID },
{ 0x1763, USCRIPT_TAGBANWA },
{ 0x1A00, USCRIPT_BUGINESE },
{ 0x1BC0, USCRIPT_BATAK },
{ 0xA930, USCRIPT_REJANG },
{ 0xA90A, USCRIPT_KAYAH_LI },
{ 0x1000, USCRIPT_MYANMAR },
{ 0x11103, USCRIPT_CHAKMA },
{ 0x1780, USCRIPT_KHMER },
{ 0x1950, USCRIPT_TAI_LE },
{ 0x1980, USCRIPT_NEW_TAI_LUE },
{ 0x1A20, USCRIPT_LANNA },
{ 0xAA00, USCRIPT_CHAM },
{ 0x1B05, USCRIPT_BALINESE },
{ 0xA984, USCRIPT_JAVANESE },
{ 0x1826, USCRIPT_MONGOLIAN },
{ 0x1C5A, USCRIPT_OL_CHIKI },
{ 0x13C4, USCRIPT_CHEROKEE },
{ 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
{ 0x168F, USCRIPT_OGHAM },
{ 0x16A0, USCRIPT_RUNIC },
{ 0x10C00, USCRIPT_ORKHON },
{ 0xA549, USCRIPT_VAI },
{ 0xA6A0, USCRIPT_BAMUM },
{ 0x16AE6, USCRIPT_BASSA_VAH },
{ 0x1E802, USCRIPT_MENDE },
{ 0xAC00, USCRIPT_HANGUL },
{ 0x304B, USCRIPT_HIRAGANA },
{ 0x30AB, USCRIPT_KATAKANA },
{ 0x3105, USCRIPT_BOPOMOFO },
{ 0xA288, USCRIPT_YI },
{ 0xA4D0, USCRIPT_LISU },
{ 0x16F00, USCRIPT_MIAO },
{ 0x118B4, USCRIPT_WARANG_CITI },
{ 0x11AC0, USCRIPT_PAU_CIN_HAU },
{ 0x16B1C, USCRIPT_PAHAWH_HMONG },
{ 0x10280, USCRIPT_LYCIAN },
{ 0x102A0, USCRIPT_CARIAN },
{ 0x10920, USCRIPT_LYDIAN },
{ 0x10300, USCRIPT_OLD_ITALIC },
{ 0x10330, USCRIPT_GOTHIC },
{ 0x10414, USCRIPT_DESERET },
{ 0x10450, USCRIPT_SHAVIAN },
{ 0x1BC20, USCRIPT_DUPLOYAN },
{ 0x10480, USCRIPT_OSMANYA },
{ 0x10500, USCRIPT_ELBASAN },
{ 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
{ 0x110D0, USCRIPT_SORA_SOMPENG },
{ 0x16A4F, USCRIPT_MRO },
{ 0x10000, USCRIPT_LINEAR_B },
{ 0x10647, USCRIPT_LINEAR_A },
{ 0x10800, USCRIPT_CYPRIOT },
{ 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
{ 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
{ 0x10B00, USCRIPT_AVESTAN },
{ 0x10873, USCRIPT_PALMYRENE },
{ 0x10896, USCRIPT_NABATAEAN },
{ 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
{ 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
{ 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
{ 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
{ 0x10AD8, USCRIPT_MANICHAEAN },
{ 0x10380, USCRIPT_UGARITIC },
{ 0x103A0, USCRIPT_OLD_PERSIAN },
{ 0x12000, USCRIPT_CUNEIFORM },
{ 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
{ 0x109A0, USCRIPT_MEROITIC_CURSIVE },
{ 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
{ 0x5B57, USCRIPT_HAN },
{ 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights
};
int32_t getReorderCode(const char* name) {
int32_t code = CollationRuleParser::getReorderCode(name);
if (code >= 0) {
return code;
}
for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) {
if (0 == strcmp(name, specialReorderTokens[i].name)) {
return specialReorderTokens[i].code;
static int32_t getCharScript(UChar32 c) {
// Linear search of sampleCharsToScripts[] for the sample character c.
// Returns the script or special reorder code listed for c,
// or USCRIPT_INVALID_CODE if c is not in the table.
for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
if(c == sampleCharsToScripts[i].sampleChar) {
return sampleCharsToScripts[i].script;
}
}
return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
return USCRIPT_INVALID_CODE; // -1
}
/**
@ -333,7 +452,7 @@ static struct {
};
static int64_t getOptionValue(const char *name) {
for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
if(uprv_strcmp(name, vt[i].name) == 0) {
return vt[i].value;
}
@ -341,11 +460,9 @@ static int64_t getOptionValue(const char *name) {
return 0;
}
static UnicodeString *leadByteScripts = NULL;
static void readAnOption(
CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {
for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
ActionType what_to_do = vt[cnt].what_to_do;
@ -491,54 +608,12 @@ static void readAnOption(
fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
}
} else if (what_to_do == READLEADBYTETOSCRIPTS) {
uint16_t leadByte = (hex2num(*pointer++) * 16);
leadByte += hex2num(*pointer++);
if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
// Extend the Hani range to the end of what this implementation uses.
// FractionalUCA.txt assumes a different algorithm for implicit primary weights,
// and different high-lead byte ranges.
leadByteScripts[leadByte] = leadByteScripts[0xdf];
return;
}
UnicodeString scripts;
for(;;) {
pointer = skipWhiteSpace(pointer);
if (*pointer == ']') {
break;
}
const char *scriptName = pointer;
char c;
while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }
if(c == 0) {
fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
*status = U_INVALID_FORMAT_ERROR;
return;
}
*pointer = 0;
int32_t reorderCode = getReorderCode(scriptName);
*pointer = c;
if (reorderCode == -3) { // COMPRESS
builder.setCompressibleLeadByte(leadByte);
continue;
}
if (reorderCode == -2) {
continue; // Ignore "TERMINATOR" etc.
}
if (reorderCode < 0 || 0xffff < reorderCode) {
fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
*status = U_INVALID_FORMAT_ERROR;
return;
}
scripts.append((UChar)reorderCode);
}
if(!scripts.isEmpty()) {
if(leadByteScripts == NULL) {
leadByteScripts = new UnicodeString[256];
}
leadByteScripts[leadByte] = scripts;
if (strstr(pointer, "COMPRESS") != NULL) {
uint16_t leadByte = (hex2num(*pointer++) * 16);
leadByte += hex2num(*pointer++);
builder.setCompressibleLeadByte(leadByte);
}
// We do not need the list of scripts on this line.
}
return;
}
@ -730,9 +805,21 @@ parseFractionalUCA(const char *filename,
// are only entered into the inverse table,
// not into the normal collation data.
builder.addRootElements(ces, cesLength, *status);
if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
// Lead byte for numeric sorting.
builder.setNumericPrimary(p);
if(s.length() == 2 && cesLength == 1) {
switch(s[1]) {
case 0x34:
// Lead byte for numeric sorting.
builder.setNumericPrimary(p);
break;
case 0xFF21:
builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
break;
case 0xFF3A:
builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
break;
default:
break;
}
}
} else {
UChar32 c = s.char32At(0);
@ -742,8 +829,29 @@ parseFractionalUCA(const char *filename,
// CollationBaseDataBuilder::init() maps them to special CEs.
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
if(0xfffd <= c && c <= 0xffff) { continue; }
if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
continue;
if(s.length() >= 2 && c == 0xFDD1) {
UChar32 c2 = s.char32At(1);
int32_t script = getCharScript(c2);
if(script < 0) {
fprintf(stderr,
"Error: Unknown script for first-primary sample character "
"U+%04x on line %u of %s\n"
" (add the character to genuca.cpp sampleCharsToScripts[])\n",
c2, (int)line, filename);
exit(U_INVALID_FORMAT_ERROR);
}
if(script == USCRIPT_UNKNOWN) {
// FDD1 FDD0, first unassigned-implicit primary
builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
continue;
}
builder.addScriptStart(script, p);
if(script == USCRIPT_HIRAGANA) {
builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
} else if(script == USCRIPT_HAN) {
builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
}
}
if(0xe0000000 <= p && p < 0xf0000000) {
@ -887,40 +995,6 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,
return;
}
if(leadByteScripts != NULL) {
uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
do {
// Find the range of lead bytes with this set of scripts.
const UnicodeString &firstScripts = leadByteScripts[firstLead];
if(firstScripts.isEmpty()) {
fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
uint32_t lead = firstLead;
for(;;) {
++lead;
const UnicodeString &scripts = leadByteScripts[lead];
// The scripts should either be the same or disjoint.
// We do not test if all reordering groups have disjoint sets of scripts.
if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
if(scripts != firstScripts) {
fprintf(stderr,
"[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
"but not all scripts match\n",
(int)firstLead, scripts[0], (int)lead);
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
}
// lead is one greater than the last lead byte with the same set of scripts as firstLead.
builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
if(U_FAILURE(errorCode)) { return; }
firstLead = lead;
} while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
delete[] leadByteScripts;
}
CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
builder.enableFastLatin();
builder.build(data, errorCode);
@ -1168,7 +1242,7 @@ extern "C" int
main(int argc, char* argv[]) {
U_MAIN_INIT_ARGS(argc, argv);
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
/* error handling, printing usage message */
if(argc<0) {