ICU-2219 add java output option to RBBI rule parse state table generation perl script

X-SVN-Rev: 10936
2025-04-10 07:39:16 +00:00 · 2003-01-30 20:08:36 +00:00 · 2003-01-30 20:08:36 +00:00 · 578d384880
commit 578d384880
parent 707459650d
1 changed files with 196 additions and 119 deletions
--- a/icu4c/source/common/rbbicst.pl
+++ b/icu4c/source/common/rbbicst.pl
@ -6,7 +6,7 @@
 #  rbbicst   Compile the RBBI rule paser state table data into initialized C data.
 #            Usage:
 #                   cd icu/source/common
-#                   perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
+#                   perl rbbicst.pl [-j] < rbbirpt.txt > rbbirpt.h
 #
 #             The output file, rbbrpt.h, is included by some of the .cpp rbbi
 #             implementation files.   This perl script is NOT run as part
@ -16,6 +16,11 @@
 #             See rbbirpt.h for a description of the input format for this script.
 #

+if ($ARGV[0] eq "-j") {
+    $javaOutput = 1;
+    shift @ARGV;
+}
+

 $num_states = 1;     # Always the state number for the line being compiled.
 $line_num  = 0;      # The line number in the input file.
@ -192,133 +197,205 @@ for ($state=1; $state<$num_states; $state++) {

 die if ($errors>0);

-print "//---------------------------------------------------------------------------------\n";
-print "//\n";
-print "// Generated Header File.  Do not edit by hand.\n";
-print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
-print "//    rule parser.\n";
-print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
-print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
-print "//\n";
-print "//   Copyright (C) 2002 International Business Machines Corporation \n";
-print "//   and others. All rights reserved.  \n";
-print "//\n";
-print "//---------------------------------------------------------------------------------\n";
-print "#ifndef RBBIRPT_H\n";
-print "#define RBBIRPT_H\n";
-print "\n";
-print "U_NAMESPACE_BEGIN\n";
+if ($javaOutput) {
+    print "//---------------------------------------------------------------------------------\n";
+    print "//\n";
+    print "// Generated Java File.  Do not edit by hand.\n";
+    print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
+    print "//    rule parser.\n";
+    print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
+    print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
+    print "//\n";
+    print "//   Copyright (C) 2003 International Business Machines Corporation \n";
+    print "//   and others. All rights reserved.  \n";
+    print "//\n";
+    print "//---------------------------------------------------------------------------------\n";

-#
-# Emit the constants for indicies of Unicode Sets
-#   Define one constant for each of the character classes encountered.
-#   At the same time, store the index corresponding to the set name back into hash.
-#
-print "//\n";
-print "// Character classes for RBBI rule scanning.\n";
-print "//\n";
-$i = 128;                   # State Table values for Unicode char sets range from 128-250.
-                            # Sets "default", "escaped", etc. get special handling.
-                            #  They have no corresponding UnicodeSet object in the state machine,
-                            #    but are handled by special case code.  So we emit no reference
-                            #    to a UnicodeSet object to them here.
-foreach $setName (keys %charClasses) {
-    if ($setName eq "default") {
-        $charClasses{$setName} = 255;}
-    elsif ($setName eq "escaped") {
-        $charClasses{$setName} = 254;}
-    elsif ($setName eq "escapedP") {
-        $charClasses{$setName} = 253;}
-    elsif ($setName eq "eof") {
-        $charClasses{$setName} = 252;}
-    else {
-        # Normal character class.  Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
-       print "    static const uint8_t kRuleSet_$setName = $i;\n";
-        $charClasses{$setName} = $i;
-        $i++;
+    print "public class rbbirpt {\n";
+
+    #
+    # Emit the constants for the actions to be performed.
+    #
+    $n = 1;
+    foreach $act (keys %actions) {
+        print "    public static final int $act = $n;\n";
+        $n++;
    }
+    print " \n";
+    #
+    # emit the state transition table
+    #
+    print "public static final String[] gRuleParseStateTable = {\n";
+    printf("    \"\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\"\n", doNOP, 0, 0, 0, 1);
+    for ($state=1; $state < $num_states; $state++) {
+        printf("  , \"\\u%04.4x", $state_func_name[$state]);
+        # print "    , {$state_func_name[$state],";
+        if ($state_literal_chars[$state] ne "") {
+            printf("\\u%04.4x", $state_func_name[$state]);
+        }else {
+            printf("\\u%04.4x", $charClasses{$state_char_class[$state]});
+        }
+        printf("\\u%04.4x", $states{$state_dest_state[$state]});
+
+        # The push-state field is optional.  If omitted, fill field with a zero, which flags
+        #   the state machine that there is no push state.
+        if ($state_push_state[$state] eq "") {
+            print "\\u0000";
+        } else {
+           printf("\\u%04.4x", $states{$state_push_state[$state]});
+        }
+        printf("\\u%04.4x", $state_flag[$state]);
+
+        # For the first row of each state, append the state name.
+        #   Used for debugging only.
+        if ($stateNames[$state] ne "") {
+            printf("%-20s", $stateNames[$state]."\"");
+        } else {
+            printf("%-20s", "\"");
+        }
+
+        # Put out a C++ comment showing the number (index) of this state row,
+        print "    //  $state ";
+        print "\n";
+    };
+    print "};\n";
+    print "}\n";
 }
-print "\n\n";
+else
+{
+    #
+    #  C++ Output ...
+    #

-#
-# Emit the enum for the actions to be performed.
-#
-print "enum RBBI_RuleParseAction {\n";
-foreach $act (keys %actions) {
-    print "    $act,\n";
-}
-print "    rbbiLastAction};\n\n";

-#
-# Emit the struct definition for transtion table elements.
-#
-print "//-------------------------------------------------------------------------------\n";
-print "//\n";
-print "//  RBBIRuleTableEl    represents the structure of a row in the transition table\n";
-print "//                     for the rule parser state machine.\n";
-print "//-------------------------------------------------------------------------------\n";
-print "struct RBBIRuleTableEl {\n";
-print "    RBBI_RuleParseAction          fAction;\n";
-print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
-print "                                                    // 128-255:  character class index\n";
-print "    uint8_t                       fNextState;       // 0-250:    normal next-stat numbers\n";
-print "                                                    // 255:      pop next-state from stack.\n";
-print "    uint8_t                       fPushState;\n";
-print "    UBool                         fNextChar;\n";
-print "};\n\n";
-
-#
-# emit the state transition table
-#
-print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
-print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
-for ($state=1; $state < $num_states; $state++) {
-    print "    , {$state_func_name[$state],";
-    if ($state_literal_chars[$state] ne "") {
-        $c = $state_literal_chars[$state];
-        printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
-    }else {
-        print " $charClasses{$state_char_class[$state]},";
-    }
-    print " $states{$state_dest_state[$state]},";
-
-    # The push-state field is optional.  If omitted, fill field with a zero, which flags
-    #   the state machine that there is no push state.
-    if ($state_push_state[$state] eq "") {
-        print "0, ";
-    } else {
-        print " $states{$state_push_state[$state]},";
-    }
-    print " $state_flag[$state]} ";
-
-    # Put out a C++ comment showing the number (index) of this state row,
-    #   and, if this is the first row of the table for this state, the state name.
-    print "    //  $state ";
-    if ($stateNames[$state] ne "") {
-        print "     $stateNames[$state]";
-    }
+    print "//---------------------------------------------------------------------------------\n";
+    print "//\n";
+    print "// Generated Header File.  Do not edit by hand.\n";
+    print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
+    print "//    rule parser.\n";
+    print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
+    print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
+    print "//\n";
+    print "//   Copyright (C) 2002 International Business Machines Corporation \n";
+    print "//   and others. All rights reserved.  \n";
+    print "//\n";
+    print "//---------------------------------------------------------------------------------\n";
+    print "#ifndef RBBIRPT_H\n";
+    print "#define RBBIRPT_H\n";
    print "\n";
-};
-print " };\n";
+    print "U_NAMESPACE_BEGIN\n";

-
-#
-# emit a mapping array from state numbers to state names.
-#
-#    This array is used for producing debugging output from the rule parser.
-#
-print "static const char * const RBBIRuleStateNames[] = {";
-for ($state=0; $state<$num_states; $state++) {
-    if ($stateNames[$state] ne "") {
-        print "     \"$stateNames[$state]\",\n";
-    } else {
-        print "    0,\n";
+    #
+    # Emit the constants for indicies of Unicode Sets
+    #   Define one constant for each of the character classes encountered.
+    #   At the same time, store the index corresponding to the set name back into hash.
+    #
+    print "//\n";
+    print "// Character classes for RBBI rule scanning.\n";
+    print "//\n";
+    $i = 128;                   # State Table values for Unicode char sets range from 128-250.
+                                # Sets "default", "escaped", etc. get special handling.
+                                #  They have no corresponding UnicodeSet object in the state machine,
+                                #    but are handled by special case code.  So we emit no reference
+                                #    to a UnicodeSet object to them here.
+    foreach $setName (keys %charClasses) {
+        if ($setName eq "default") {
+            $charClasses{$setName} = 255;}
+        elsif ($setName eq "escaped") {
+            $charClasses{$setName} = 254;}
+        elsif ($setName eq "escapedP") {
+            $charClasses{$setName} = 253;}
+        elsif ($setName eq "eof") {
+            $charClasses{$setName} = 252;}
+        else {
+            # Normal character class.  Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
+           print "    static const uint8_t kRuleSet_$setName = $i;\n";
+            $charClasses{$setName} = $i;
+            $i++;
+        }
    }
+    print "\n\n";
+
+    #
+    # Emit the enum for the actions to be performed.
+    #
+    print "enum RBBI_RuleParseAction {\n";
+    foreach $act (keys %actions) {
+        print "    $act,\n";
+    }
+    print "    rbbiLastAction};\n\n";
+
+    #
+    # Emit the struct definition for transtion table elements.
+    #
+    print "//-------------------------------------------------------------------------------\n";
+    print "//\n";
+    print "//  RBBIRuleTableEl    represents the structure of a row in the transition table\n";
+    print "//                     for the rule parser state machine.\n";
+    print "//-------------------------------------------------------------------------------\n";
+    print "struct RBBIRuleTableEl {\n";
+    print "    RBBI_RuleParseAction          fAction;\n";
+    print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
+    print "                                                    // 128-255:  character class index\n";
+    print "    uint8_t                       fNextState;       // 0-250:    normal next-stat numbers\n";
+    print "                                                    // 255:      pop next-state from stack.\n";
+    print "    uint8_t                       fPushState;\n";
+    print "    UBool                         fNextChar;\n";
+    print "};\n\n";
+
+    #
+    # emit the state transition table
+    #
+    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
+    print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
+    for ($state=1; $state < $num_states; $state++) {
+        print "    , {$state_func_name[$state],";
+        if ($state_literal_chars[$state] ne "") {
+            $c = $state_literal_chars[$state];
+            printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
+        }else {
+            print " $charClasses{$state_char_class[$state]},";
+        }
+        print " $states{$state_dest_state[$state]},";
+
+        # The push-state field is optional.  If omitted, fill field with a zero, which flags
+        #   the state machine that there is no push state.
+        if ($state_push_state[$state] eq "") {
+            print "0, ";
+        } else {
+            print " $states{$state_push_state[$state]},";
+        }
+        print " $state_flag[$state]} ";
+
+        # Put out a C++ comment showing the number (index) of this state row,
+        #   and, if this is the first row of the table for this state, the state name.
+        print "    //  $state ";
+        if ($stateNames[$state] ne "") {
+            print "     $stateNames[$state]";
+        }
+        print "\n";
+    };
+    print " };\n";
+
+
+    #
+    # emit a mapping array from state numbers to state names.
+    #
+    #    This array is used for producing debugging output from the rule parser.
+    #
+    print "static const char * const RBBIRuleStateNames[] = {";
+    for ($state=0; $state<$num_states; $state++) {
+        if ($stateNames[$state] ne "") {
+            print "     \"$stateNames[$state]\",\n";
+        } else {
+            print "    0,\n";
+        }
+    }
+    print "    0};\n\n";
+
+    print "U_NAMESPACE_END\n";
+    print "#endif\n";
 }
-print "    0};\n\n";
-
-print "U_NAMESPACE_END\n";
-print "#endif\n";