From 798f5235ddb24bf0deb464cace989bb614b4dc48 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Fri, 6 May 2016 23:19:36 +0000
Subject: [PATCH] ICU-12526 genuca: add new script sample characters, more
 readable error output

X-SVN-Rev: 38716
---
 tools/unicode/c/genuca/genuca.cpp | 106 ++++++++++++++++--------------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp
index b152cdbd35b..7332c4d8b1e 100644
--- a/tools/unicode/c/genuca/genuca.cpp
+++ b/tools/unicode/c/genuca/genuca.cpp
@@ -236,6 +236,7 @@ static const struct {
     { 0x078C, USCRIPT_THAANA },
     { 0x07CA, USCRIPT_NKO },
     { 0x07D8, USCRIPT_NKO },
+    { 0x2D30, USCRIPT_TIFINAGH },
     { 0x2D5E, USCRIPT_TIFINAGH },
     { 0x12A0, USCRIPT_ETHIOPIC },
     { 0x0905, USCRIPT_DEVANAGARI },
@@ -258,6 +259,7 @@ static const struct {
     { 0x112BE, USCRIPT_KHUDAWADI },
     { 0x1128F, USCRIPT_MULTANI },
     { 0x11315, USCRIPT_GRANTHA },
+    { 0x11412, USCRIPT_NEWA },
     { 0x11484, USCRIPT_TIRHUTA },
     { 0x1158E, USCRIPT_SIDDHAM },
     { 0x1160E, USCRIPT_MODI },
@@ -266,10 +268,12 @@ static const struct {
     { 0x1B83, USCRIPT_SUNDANESE },
     { 0x11005, USCRIPT_BRAHMI },
     { 0x10A00, USCRIPT_KHAROSHTHI },
+    { 0x11C0E, USCRIPT_BHAIKSUKI },
     { 0x0E17, USCRIPT_THAI },
     { 0x0EA5, USCRIPT_LAO },
     { 0xAA80, USCRIPT_TAI_VIET },
     { 0x0F40, USCRIPT_TIBETAN },
+    { 0x11C72, USCRIPT_MARCHEN },
     { 0x1C00, USCRIPT_LEPCHA },
     { 0xA840, USCRIPT_PHAGS_PA },
     { 0x1900, USCRIPT_LIMBU },
@@ -293,6 +297,7 @@ static const struct {
     { 0x1826, USCRIPT_MONGOLIAN },
     { 0x1C5A, USCRIPT_OL_CHIKI },
     { 0x13C4, USCRIPT_CHEROKEE },
+    { 0x104B5, USCRIPT_OSAGE },
     { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
     { 0x168F, USCRIPT_OGHAM },
     { 0x16A0, USCRIPT_RUNIC },
@@ -302,6 +307,7 @@ static const struct {
     { 0xA6A0, USCRIPT_BAMUM },
     { 0x16AE6, USCRIPT_BASSA_VAH },
     { 0x1E802, USCRIPT_MENDE },
+    { 0x1E909, USCRIPT_ADLAM, },
     { 0xAC00, USCRIPT_HANGUL },
     { 0x304B, USCRIPT_HIRAGANA },
     { 0x30AB, USCRIPT_KATAKANA },
@@ -350,6 +356,7 @@ static const struct {
     { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
     { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
     { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
+    { 0x18229, USCRIPT_TANGUT },
     { 0x5B57, USCRIPT_HAN },
     { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
 };
@@ -632,7 +639,7 @@ static void readAnOption(
 }
 
 static UBool
-readAnElement(FILE *data,
+readAnElement(char *line,
         CollationBaseDataBuilder &builder,
         UnicodeString &prefix, UnicodeString &s,
         int64_t ces[32], int32_t &cesLength,
@@ -640,79 +647,69 @@ readAnElement(FILE *data,
     if(U_FAILURE(*status)) {
         return FALSE;
     }
-    char buffer[30000];
-    char *result = fgets(buffer, sizeof(buffer), data);
-    if(result == NULL) {
-        if(feof(data)) {
-            return FALSE;
-        } else {
-            fprintf(stderr, "empty line but no EOF!\n");
-            *status = U_INVALID_FORMAT_ERROR;
-            return FALSE;
-        }
-    }
-    int32_t buflen = (int32_t)uprv_strlen(buffer);
-    while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
-      buffer[--buflen] = 0;
+    int32_t lineLength = (int32_t)uprv_strlen(line);
+    while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
+      line[--lineLength] = 0;
     }
 
-    if(buflen >= 3 && buffer[0] == (char)0xef &&
-            buffer[1] == (char)0xbb && buffer[2] == (char)0xbf) {
+    if(lineLength >= 3 && line[0] == (char)0xef &&
+            line[1] == (char)0xbb && line[2] == (char)0xbf) {
         // U+FEFF UTF-8 signature byte sequence.
         // Ignore, assuming it is at the start of the file.
-        buflen -= 3;
-        uprv_memmove(buffer, buffer + 3, buflen + 1);  // +1: including NUL terminator
+        line += 3;
+        lineLength -= 3;
     }
-    if(buffer[0] == 0 || buffer[0] == '#') {
+    if(line[0] == 0 || line[0] == '#') {
         return FALSE; // just a comment, skip whole line
     }
 
     // Directives.
-    if(buffer[0] == '[') {
-        readAnOption(builder, buffer, status);
+    if(line[0] == '[') {
+        readAnOption(builder, line, status);
         return FALSE;
     }
 
-    char *startCodePoint = buffer;
+    CharString input;
+    char *startCodePoint = line;
     char *endCodePoint = strchr(startCodePoint, ';');
     if(endCodePoint == NULL) {
-        fprintf(stderr, "error - line with no code point!\n");
+        fprintf(stderr, "error - line with no code point:\n%s\n", line);
         *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
         return FALSE;
-    } else {
-        *endCodePoint = 0;
     }
 
-    char *pipePointer = strchr(buffer, '|');
+    char *pipePointer = strchr(line, '|');
     if (pipePointer != NULL) {
         // Read the prefix string which precedes the actual string.
-        *pipePointer = 0;
+        input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
         UChar *prefixChars = prefix.getBuffer(32);
         int32_t prefixSize =
-            u_parseString(startCodePoint,
+            u_parseString(input.data(),
                           prefixChars, prefix.getCapacity(),
                           NULL, status);
         if(U_FAILURE(*status)) {
             prefix.releaseBuffer(0);
-            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n",
-                    startCodePoint, u_errorName(*status));
+            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
+                    input.data(), line, u_errorName(*status));
             *status = U_INVALID_FORMAT_ERROR;
             return FALSE;
         }
         prefix.releaseBuffer(prefixSize);
         startCodePoint = pipePointer + 1;
+        input.clear();
     }
 
     // Read the string which gets the CE(s) assigned.
+    input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
     UChar *uchars = s.getBuffer(32);
     int32_t cSize =
-        u_parseString(startCodePoint,
+        u_parseString(input.data(),
                       uchars, s.getCapacity(),
                       NULL, status);
     if(U_FAILURE(*status)) {
         s.releaseBuffer(0);
-        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n",
-                startCodePoint, u_errorName(*status));
+        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
+                input.data(), line, u_errorName(*status));
         *status = U_INVALID_FORMAT_ERROR;
         return FALSE;
     }
@@ -732,14 +729,14 @@ readAnElement(FILE *data,
             break;
         }
         if(cesLength >= 31) {
-            fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer);
+            fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
             *status = U_INVALID_FORMAT_ERROR;
             return FALSE;
         }
         ces[cesLength++] = parseCE(builder, pointer, *status);
         if(U_FAILURE(*status)) {
             fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
-                    buffer, u_errorName(*status));
+                    line, u_errorName(*status));
             return FALSE;
         }
     }
@@ -759,11 +756,11 @@ readAnElement(FILE *data,
                 uint8_t b = (uint8_t)(ce >> (j * 8));
                 if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
                 if (b == 1) {
-                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer);
+                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
                     return FALSE;
                 }
                 if (j == 7 && b == 2) {
-                    fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", buffer);
+                    fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
                     return FALSE;
                 }
                 if (j == 7) {
@@ -774,7 +771,7 @@ readAnElement(FILE *data,
                     // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
                     if (isCompressible && (b <= 3 || b == 0xff)) {
                         fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
-                                b, buffer);
+                                b, line);
                         return FALSE;
                     }
                 }
@@ -797,23 +794,34 @@ parseFractionalUCA(const char *filename,
         *status = U_FILE_ACCESS_ERROR;
         return;
     }
-    uint32_t line = 0;
+    int32_t lineNumber = 0;
+    char buffer[30000];
 
     UChar32 maxCodePoint = 0;
     while(!feof(data)) {
         if(U_FAILURE(*status)) {
             fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
-                *status, u_errorName(*status), (int)line, filename);
+                *status, u_errorName(*status), (int)lineNumber, filename);
             exit(*status);
         }
 
-        line++;
+        lineNumber++;
+        char *line = fgets(buffer, sizeof(buffer), data);
+        if(line == NULL) {
+            if(feof(data)) {
+                break;
+            } else {
+                fprintf(stderr, "no more input line and also no EOF!\n");
+                *status = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+        }
 
         UnicodeString prefix;
         UnicodeString s;
         int64_t ces[32];
         int32_t cesLength = 0;
-        if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) {
+        if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
             // we have read the line, now do something sensible with the read data!
             uint32_t p = (uint32_t)(ces[0] >> 32);
 
@@ -852,9 +860,10 @@ parseFractionalUCA(const char *filename,
                     if(script < 0) {
                         fprintf(stderr,
                                 "Error: Unknown script for first-primary sample character "
-                                "U+%04x on line %u of %s\n"
+                                "U+%04X on line %u of %s:\n"
+                                "%s\n"
                                 "    (add the character to genuca.cpp sampleCharsToScripts[])\n",
-                                c2, (int)line, filename);
+                                c2, (int)lineNumber, filename, line);
                         exit(U_INVALID_FORMAT_ERROR);
                     }
                     if(script == USCRIPT_UNKNOWN) {
@@ -874,8 +883,9 @@ parseFractionalUCA(const char *filename,
                 if(0xe0000000 <= p && p < 0xf0000000) {
                     fprintf(stderr,
                             "Error: Unexpected mapping to an implicit or trailing primary"
-                            " on line %u of %s.\n",
-                            (int)line, filename);
+                            " on line %u of %s:\n"
+                            "%s\n",
+                            (int)lineNumber, filename, line);
                     exit(U_INVALID_FORMAT_ERROR);
                 }
 
@@ -988,7 +998,7 @@ parseFractionalUCA(const char *filename,
     }
 
     if (beVerbose) {
-        printf("\nLines read: %u\n", (int)line);
+        printf("\nLines read: %u\n", (int)lineNumber);
     }
 
     fclose(data);