From 798f5235ddb24bf0deb464cace989bb614b4dc48 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 6 May 2016 23:19:36 +0000 Subject: [PATCH] ICU-12526 genuca: add new script sample characters, more readable error output X-SVN-Rev: 38716 --- tools/unicode/c/genuca/genuca.cpp | 106 ++++++++++++++++-------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp index b152cdbd35b..7332c4d8b1e 100644 --- a/tools/unicode/c/genuca/genuca.cpp +++ b/tools/unicode/c/genuca/genuca.cpp @@ -236,6 +236,7 @@ static const struct { { 0x078C, USCRIPT_THAANA }, { 0x07CA, USCRIPT_NKO }, { 0x07D8, USCRIPT_NKO }, + { 0x2D30, USCRIPT_TIFINAGH }, { 0x2D5E, USCRIPT_TIFINAGH }, { 0x12A0, USCRIPT_ETHIOPIC }, { 0x0905, USCRIPT_DEVANAGARI }, @@ -258,6 +259,7 @@ static const struct { { 0x112BE, USCRIPT_KHUDAWADI }, { 0x1128F, USCRIPT_MULTANI }, { 0x11315, USCRIPT_GRANTHA }, + { 0x11412, USCRIPT_NEWA }, { 0x11484, USCRIPT_TIRHUTA }, { 0x1158E, USCRIPT_SIDDHAM }, { 0x1160E, USCRIPT_MODI }, @@ -266,10 +268,12 @@ static const struct { { 0x1B83, USCRIPT_SUNDANESE }, { 0x11005, USCRIPT_BRAHMI }, { 0x10A00, USCRIPT_KHAROSHTHI }, + { 0x11C0E, USCRIPT_BHAIKSUKI }, { 0x0E17, USCRIPT_THAI }, { 0x0EA5, USCRIPT_LAO }, { 0xAA80, USCRIPT_TAI_VIET }, { 0x0F40, USCRIPT_TIBETAN }, + { 0x11C72, USCRIPT_MARCHEN }, { 0x1C00, USCRIPT_LEPCHA }, { 0xA840, USCRIPT_PHAGS_PA }, { 0x1900, USCRIPT_LIMBU }, @@ -293,6 +297,7 @@ static const struct { { 0x1826, USCRIPT_MONGOLIAN }, { 0x1C5A, USCRIPT_OL_CHIKI }, { 0x13C4, USCRIPT_CHEROKEE }, + { 0x104B5, USCRIPT_OSAGE }, { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL }, { 0x168F, USCRIPT_OGHAM }, { 0x16A0, USCRIPT_RUNIC }, @@ -302,6 +307,7 @@ static const struct { { 0xA6A0, USCRIPT_BAMUM }, { 0x16AE6, USCRIPT_BASSA_VAH }, { 0x1E802, USCRIPT_MENDE }, + { 0x1E909, USCRIPT_ADLAM, }, { 0xAC00, USCRIPT_HANGUL }, { 0x304B, USCRIPT_HIRAGANA }, { 0x30AB, USCRIPT_KATAKANA }, @@ -350,6 +356,7 @@ static const struct { { 0x109A0, USCRIPT_MEROITIC_CURSIVE }, { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS }, { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS }, + { 0x18229, USCRIPT_TANGUT }, { 0x5B57, USCRIPT_HAN }, { 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights }; @@ -632,7 +639,7 @@ static void readAnOption( } static UBool -readAnElement(FILE *data, +readAnElement(char *line, CollationBaseDataBuilder &builder, UnicodeString &prefix, UnicodeString &s, int64_t ces[32], int32_t &cesLength, @@ -640,79 +647,69 @@ readAnElement(FILE *data, if(U_FAILURE(*status)) { return FALSE; } - char buffer[30000]; - char *result = fgets(buffer, sizeof(buffer), data); - if(result == NULL) { - if(feof(data)) { - return FALSE; - } else { - fprintf(stderr, "empty line but no EOF!\n"); - *status = U_INVALID_FORMAT_ERROR; - return FALSE; - } - } - int32_t buflen = (int32_t)uprv_strlen(buffer); - while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) { - buffer[--buflen] = 0; + int32_t lineLength = (int32_t)uprv_strlen(line); + while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) { + line[--lineLength] = 0; } - if(buflen >= 3 && buffer[0] == (char)0xef && - buffer[1] == (char)0xbb && buffer[2] == (char)0xbf) { + if(lineLength >= 3 && line[0] == (char)0xef && + line[1] == (char)0xbb && line[2] == (char)0xbf) { // U+FEFF UTF-8 signature byte sequence. // Ignore, assuming it is at the start of the file. - buflen -= 3; - uprv_memmove(buffer, buffer + 3, buflen + 1); // +1: including NUL terminator + line += 3; + lineLength -= 3; } - if(buffer[0] == 0 || buffer[0] == '#') { + if(line[0] == 0 || line[0] == '#') { return FALSE; // just a comment, skip whole line } // Directives. - if(buffer[0] == '[') { - readAnOption(builder, buffer, status); + if(line[0] == '[') { + readAnOption(builder, line, status); return FALSE; } - char *startCodePoint = buffer; + CharString input; + char *startCodePoint = line; char *endCodePoint = strchr(startCodePoint, ';'); if(endCodePoint == NULL) { - fprintf(stderr, "error - line with no code point!\n"); + fprintf(stderr, "error - line with no code point:\n%s\n", line); *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ return FALSE; - } else { - *endCodePoint = 0; } - char *pipePointer = strchr(buffer, '|'); + char *pipePointer = strchr(line, '|'); if (pipePointer != NULL) { // Read the prefix string which precedes the actual string. - *pipePointer = 0; + input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status); UChar *prefixChars = prefix.getBuffer(32); int32_t prefixSize = - u_parseString(startCodePoint, + u_parseString(input.data(), prefixChars, prefix.getCapacity(), NULL, status); if(U_FAILURE(*status)) { prefix.releaseBuffer(0); - fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n", - startCodePoint, u_errorName(*status)); + fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n", + input.data(), line, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; return FALSE; } prefix.releaseBuffer(prefixSize); startCodePoint = pipePointer + 1; + input.clear(); } // Read the string which gets the CE(s) assigned. + input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status); UChar *uchars = s.getBuffer(32); int32_t cSize = - u_parseString(startCodePoint, + u_parseString(input.data(), uchars, s.getCapacity(), NULL, status); if(U_FAILURE(*status)) { s.releaseBuffer(0); - fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n", - startCodePoint, u_errorName(*status)); + fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n", + input.data(), line, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; return FALSE; } @@ -732,14 +729,14 @@ readAnElement(FILE *data, break; } if(cesLength >= 31) { - fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer); + fprintf(stderr, "Error: Too many CEs on line '%s'\n", line); *status = U_INVALID_FORMAT_ERROR; return FALSE; } ces[cesLength++] = parseCE(builder, pointer, *status); if(U_FAILURE(*status)) { fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n", - buffer, u_errorName(*status)); + line, u_errorName(*status)); return FALSE; } } @@ -759,11 +756,11 @@ readAnElement(FILE *data, uint8_t b = (uint8_t)(ce >> (j * 8)); if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits if (b == 1) { - fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer); + fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line); return FALSE; } if (j == 7 && b == 2) { - fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", buffer); + fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line); return FALSE; } if (j == 7) { @@ -774,7 +771,7 @@ readAnElement(FILE *data, // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible. if (isCompressible && (b <= 3 || b == 0xff)) { fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n", - b, buffer); + b, line); return FALSE; } } @@ -797,23 +794,34 @@ parseFractionalUCA(const char *filename, *status = U_FILE_ACCESS_ERROR; return; } - uint32_t line = 0; + int32_t lineNumber = 0; + char buffer[30000]; UChar32 maxCodePoint = 0; while(!feof(data)) { if(U_FAILURE(*status)) { fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", - *status, u_errorName(*status), (int)line, filename); + *status, u_errorName(*status), (int)lineNumber, filename); exit(*status); } - line++; + lineNumber++; + char *line = fgets(buffer, sizeof(buffer), data); + if(line == NULL) { + if(feof(data)) { + break; + } else { + fprintf(stderr, "no more input line and also no EOF!\n"); + *status = U_INVALID_FORMAT_ERROR; + return; + } + } UnicodeString prefix; UnicodeString s; int64_t ces[32]; int32_t cesLength = 0; - if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) { + if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) { // we have read the line, now do something sensible with the read data! uint32_t p = (uint32_t)(ces[0] >> 32); @@ -852,9 +860,10 @@ parseFractionalUCA(const char *filename, if(script < 0) { fprintf(stderr, "Error: Unknown script for first-primary sample character " - "U+%04x on line %u of %s\n" + "U+%04X on line %u of %s:\n" + "%s\n" " (add the character to genuca.cpp sampleCharsToScripts[])\n", - c2, (int)line, filename); + c2, (int)lineNumber, filename, line); exit(U_INVALID_FORMAT_ERROR); } if(script == USCRIPT_UNKNOWN) { @@ -874,8 +883,9 @@ parseFractionalUCA(const char *filename, if(0xe0000000 <= p && p < 0xf0000000) { fprintf(stderr, "Error: Unexpected mapping to an implicit or trailing primary" - " on line %u of %s.\n", - (int)line, filename); + " on line %u of %s:\n" + "%s\n", + (int)lineNumber, filename, line); exit(U_INVALID_FORMAT_ERROR); } @@ -988,7 +998,7 @@ parseFractionalUCA(const char *filename, } if (beVerbose) { - printf("\nLines read: %u\n", (int)line); + printf("\nLines read: %u\n", (int)lineNumber); } fclose(data);