mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-12526 genuca: add new script sample characters, more readable error output
X-SVN-Rev: 38716
This commit is contained in:
parent
29e5258904
commit
798f5235dd
1 changed file with 58 additions and 48 deletions
|
@ -236,6 +236,7 @@ static const struct {
|
|||
{ 0x078C, USCRIPT_THAANA },
|
||||
{ 0x07CA, USCRIPT_NKO },
|
||||
{ 0x07D8, USCRIPT_NKO },
|
||||
{ 0x2D30, USCRIPT_TIFINAGH },
|
||||
{ 0x2D5E, USCRIPT_TIFINAGH },
|
||||
{ 0x12A0, USCRIPT_ETHIOPIC },
|
||||
{ 0x0905, USCRIPT_DEVANAGARI },
|
||||
|
@ -258,6 +259,7 @@ static const struct {
|
|||
{ 0x112BE, USCRIPT_KHUDAWADI },
|
||||
{ 0x1128F, USCRIPT_MULTANI },
|
||||
{ 0x11315, USCRIPT_GRANTHA },
|
||||
{ 0x11412, USCRIPT_NEWA },
|
||||
{ 0x11484, USCRIPT_TIRHUTA },
|
||||
{ 0x1158E, USCRIPT_SIDDHAM },
|
||||
{ 0x1160E, USCRIPT_MODI },
|
||||
|
@ -266,10 +268,12 @@ static const struct {
|
|||
{ 0x1B83, USCRIPT_SUNDANESE },
|
||||
{ 0x11005, USCRIPT_BRAHMI },
|
||||
{ 0x10A00, USCRIPT_KHAROSHTHI },
|
||||
{ 0x11C0E, USCRIPT_BHAIKSUKI },
|
||||
{ 0x0E17, USCRIPT_THAI },
|
||||
{ 0x0EA5, USCRIPT_LAO },
|
||||
{ 0xAA80, USCRIPT_TAI_VIET },
|
||||
{ 0x0F40, USCRIPT_TIBETAN },
|
||||
{ 0x11C72, USCRIPT_MARCHEN },
|
||||
{ 0x1C00, USCRIPT_LEPCHA },
|
||||
{ 0xA840, USCRIPT_PHAGS_PA },
|
||||
{ 0x1900, USCRIPT_LIMBU },
|
||||
|
@ -293,6 +297,7 @@ static const struct {
|
|||
{ 0x1826, USCRIPT_MONGOLIAN },
|
||||
{ 0x1C5A, USCRIPT_OL_CHIKI },
|
||||
{ 0x13C4, USCRIPT_CHEROKEE },
|
||||
{ 0x104B5, USCRIPT_OSAGE },
|
||||
{ 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
|
||||
{ 0x168F, USCRIPT_OGHAM },
|
||||
{ 0x16A0, USCRIPT_RUNIC },
|
||||
|
@ -302,6 +307,7 @@ static const struct {
|
|||
{ 0xA6A0, USCRIPT_BAMUM },
|
||||
{ 0x16AE6, USCRIPT_BASSA_VAH },
|
||||
{ 0x1E802, USCRIPT_MENDE },
|
||||
{ 0x1E909, USCRIPT_ADLAM, },
|
||||
{ 0xAC00, USCRIPT_HANGUL },
|
||||
{ 0x304B, USCRIPT_HIRAGANA },
|
||||
{ 0x30AB, USCRIPT_KATAKANA },
|
||||
|
@ -350,6 +356,7 @@ static const struct {
|
|||
{ 0x109A0, USCRIPT_MEROITIC_CURSIVE },
|
||||
{ 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
|
||||
{ 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
|
||||
{ 0x18229, USCRIPT_TANGUT },
|
||||
{ 0x5B57, USCRIPT_HAN },
|
||||
{ 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights
|
||||
};
|
||||
|
@ -632,7 +639,7 @@ static void readAnOption(
|
|||
}
|
||||
|
||||
static UBool
|
||||
readAnElement(FILE *data,
|
||||
readAnElement(char *line,
|
||||
CollationBaseDataBuilder &builder,
|
||||
UnicodeString &prefix, UnicodeString &s,
|
||||
int64_t ces[32], int32_t &cesLength,
|
||||
|
@ -640,79 +647,69 @@ readAnElement(FILE *data,
|
|||
if(U_FAILURE(*status)) {
|
||||
return FALSE;
|
||||
}
|
||||
char buffer[30000];
|
||||
char *result = fgets(buffer, sizeof(buffer), data);
|
||||
if(result == NULL) {
|
||||
if(feof(data)) {
|
||||
return FALSE;
|
||||
} else {
|
||||
fprintf(stderr, "empty line but no EOF!\n");
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
int32_t buflen = (int32_t)uprv_strlen(buffer);
|
||||
while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
|
||||
buffer[--buflen] = 0;
|
||||
int32_t lineLength = (int32_t)uprv_strlen(line);
|
||||
while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
|
||||
line[--lineLength] = 0;
|
||||
}
|
||||
|
||||
if(buflen >= 3 && buffer[0] == (char)0xef &&
|
||||
buffer[1] == (char)0xbb && buffer[2] == (char)0xbf) {
|
||||
if(lineLength >= 3 && line[0] == (char)0xef &&
|
||||
line[1] == (char)0xbb && line[2] == (char)0xbf) {
|
||||
// U+FEFF UTF-8 signature byte sequence.
|
||||
// Ignore, assuming it is at the start of the file.
|
||||
buflen -= 3;
|
||||
uprv_memmove(buffer, buffer + 3, buflen + 1); // +1: including NUL terminator
|
||||
line += 3;
|
||||
lineLength -= 3;
|
||||
}
|
||||
if(buffer[0] == 0 || buffer[0] == '#') {
|
||||
if(line[0] == 0 || line[0] == '#') {
|
||||
return FALSE; // just a comment, skip whole line
|
||||
}
|
||||
|
||||
// Directives.
|
||||
if(buffer[0] == '[') {
|
||||
readAnOption(builder, buffer, status);
|
||||
if(line[0] == '[') {
|
||||
readAnOption(builder, line, status);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
char *startCodePoint = buffer;
|
||||
CharString input;
|
||||
char *startCodePoint = line;
|
||||
char *endCodePoint = strchr(startCodePoint, ';');
|
||||
if(endCodePoint == NULL) {
|
||||
fprintf(stderr, "error - line with no code point!\n");
|
||||
fprintf(stderr, "error - line with no code point:\n%s\n", line);
|
||||
*status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
|
||||
return FALSE;
|
||||
} else {
|
||||
*endCodePoint = 0;
|
||||
}
|
||||
|
||||
char *pipePointer = strchr(buffer, '|');
|
||||
char *pipePointer = strchr(line, '|');
|
||||
if (pipePointer != NULL) {
|
||||
// Read the prefix string which precedes the actual string.
|
||||
*pipePointer = 0;
|
||||
input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
|
||||
UChar *prefixChars = prefix.getBuffer(32);
|
||||
int32_t prefixSize =
|
||||
u_parseString(startCodePoint,
|
||||
u_parseString(input.data(),
|
||||
prefixChars, prefix.getCapacity(),
|
||||
NULL, status);
|
||||
if(U_FAILURE(*status)) {
|
||||
prefix.releaseBuffer(0);
|
||||
fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n",
|
||||
startCodePoint, u_errorName(*status));
|
||||
fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
|
||||
input.data(), line, u_errorName(*status));
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
prefix.releaseBuffer(prefixSize);
|
||||
startCodePoint = pipePointer + 1;
|
||||
input.clear();
|
||||
}
|
||||
|
||||
// Read the string which gets the CE(s) assigned.
|
||||
input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
|
||||
UChar *uchars = s.getBuffer(32);
|
||||
int32_t cSize =
|
||||
u_parseString(startCodePoint,
|
||||
u_parseString(input.data(),
|
||||
uchars, s.getCapacity(),
|
||||
NULL, status);
|
||||
if(U_FAILURE(*status)) {
|
||||
s.releaseBuffer(0);
|
||||
fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n",
|
||||
startCodePoint, u_errorName(*status));
|
||||
fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
|
||||
input.data(), line, u_errorName(*status));
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -732,14 +729,14 @@ readAnElement(FILE *data,
|
|||
break;
|
||||
}
|
||||
if(cesLength >= 31) {
|
||||
fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer);
|
||||
fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
ces[cesLength++] = parseCE(builder, pointer, *status);
|
||||
if(U_FAILURE(*status)) {
|
||||
fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
|
||||
buffer, u_errorName(*status));
|
||||
line, u_errorName(*status));
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -759,11 +756,11 @@ readAnElement(FILE *data,
|
|||
uint8_t b = (uint8_t)(ce >> (j * 8));
|
||||
if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits
|
||||
if (b == 1) {
|
||||
fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer);
|
||||
fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
if (j == 7 && b == 2) {
|
||||
fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", buffer);
|
||||
fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
if (j == 7) {
|
||||
|
@ -774,7 +771,7 @@ readAnElement(FILE *data,
|
|||
// 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
|
||||
if (isCompressible && (b <= 3 || b == 0xff)) {
|
||||
fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
|
||||
b, buffer);
|
||||
b, line);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -797,23 +794,34 @@ parseFractionalUCA(const char *filename,
|
|||
*status = U_FILE_ACCESS_ERROR;
|
||||
return;
|
||||
}
|
||||
uint32_t line = 0;
|
||||
int32_t lineNumber = 0;
|
||||
char buffer[30000];
|
||||
|
||||
UChar32 maxCodePoint = 0;
|
||||
while(!feof(data)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
|
||||
*status, u_errorName(*status), (int)line, filename);
|
||||
*status, u_errorName(*status), (int)lineNumber, filename);
|
||||
exit(*status);
|
||||
}
|
||||
|
||||
line++;
|
||||
lineNumber++;
|
||||
char *line = fgets(buffer, sizeof(buffer), data);
|
||||
if(line == NULL) {
|
||||
if(feof(data)) {
|
||||
break;
|
||||
} else {
|
||||
fprintf(stderr, "no more input line and also no EOF!\n");
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString prefix;
|
||||
UnicodeString s;
|
||||
int64_t ces[32];
|
||||
int32_t cesLength = 0;
|
||||
if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) {
|
||||
if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
|
||||
// we have read the line, now do something sensible with the read data!
|
||||
uint32_t p = (uint32_t)(ces[0] >> 32);
|
||||
|
||||
|
@ -852,9 +860,10 @@ parseFractionalUCA(const char *filename,
|
|||
if(script < 0) {
|
||||
fprintf(stderr,
|
||||
"Error: Unknown script for first-primary sample character "
|
||||
"U+%04x on line %u of %s\n"
|
||||
"U+%04X on line %u of %s:\n"
|
||||
"%s\n"
|
||||
" (add the character to genuca.cpp sampleCharsToScripts[])\n",
|
||||
c2, (int)line, filename);
|
||||
c2, (int)lineNumber, filename, line);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if(script == USCRIPT_UNKNOWN) {
|
||||
|
@ -874,8 +883,9 @@ parseFractionalUCA(const char *filename,
|
|||
if(0xe0000000 <= p && p < 0xf0000000) {
|
||||
fprintf(stderr,
|
||||
"Error: Unexpected mapping to an implicit or trailing primary"
|
||||
" on line %u of %s.\n",
|
||||
(int)line, filename);
|
||||
" on line %u of %s:\n"
|
||||
"%s\n",
|
||||
(int)lineNumber, filename, line);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
|
||||
|
@ -988,7 +998,7 @@ parseFractionalUCA(const char *filename,
|
|||
}
|
||||
|
||||
if (beVerbose) {
|
||||
printf("\nLines read: %u\n", (int)line);
|
||||
printf("\nLines read: %u\n", (int)lineNumber);
|
||||
}
|
||||
|
||||
fclose(data);
|
||||
|
|
Loading…
Add table
Reference in a new issue