ICU-2219 Produce good java output from rbbi scanner state table tool.

X-SVN-Rev: 18921
This commit is contained in:
Andy Heninger 2005-12-21 22:00:45 +00:00
parent 9ee6e47841
commit bcf7c4fe25

View file

@ -6,14 +6,15 @@
# rbbicst Compile the RBBI rule parser state table data into initialized C (or, with -j, Java) data.
# Usage:
# cd icu/source/common
# perl rbbicst.pl [-j] < rbbirpt.txt > rbbirpt.h
# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
# perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
#
# The output file, rbbirpt.h, is included by some of the .cpp rbbi
# implementation files. This perl script is NOT run as part
# of a normal ICU build. It is run by hand when needed, and the
# rbbirpt.h generated file is put back into cvs.
#
# See rbbirpt.h for a description of the input format for this script.
# See rbbirpt.txt for a description of the input format for this script.
#
if ($ARGV[0] eq "-j") {
@ -106,9 +107,9 @@ line_loop: while (<>) {
#
# do the 'n' flag
#
$state_flag[$num_states] = "FALSE";
$state_flag[$num_states] = $javaOutput? "false" : "FALSE";
if ($fields[0] eq "n") {
$state_flag[$num_states] = "TRUE";
$state_flag[$num_states] = $javaOutput? "true": "TRUE";
shift @fields;
}
@ -197,6 +198,33 @@ for ($state=1; $state<$num_states; $state++) {
die if ($errors>0);
#
# Assign numbers to each of the character classes used.
# Sets are numbered from 128 - 250
# The values 0-127 in the state table are used for matching
# individual ASCII characters (the only thing that can appear in the rules.)
# The "set" names appearing in the code below (default, etc.) need special
# handling because they do not correspond to a normal set of characters,
# but trigger special handling by code in the state machine.
#
# Give every character class named in %charClasses its numeric code.
#
# The pseudo-classes listed in %fixedClassCode do not stand for an ordinary
# set of characters; they keep fixed codes at the top of the range so the
# state machine can recognize them and apply its special-case handling.
# Every other class receives the next sequential number, starting at 128,
# in sorted-name order.
my %fixedClassCode = (
    "default"  => 255,
    "escaped"  => 254,
    "escapedP" => 253,
    "eof"      => 252,
);

$i = 128;
for my $className (sort keys %charClasses) {
    if (exists $fixedClassCode{$className}) {
        # Special pseudo-class: use its fixed code, consume no sequence number.
        $charClasses{$className} = $fixedClassCode{$className};
        next;
    }

    # Normal (single) character class: take the next free number.
    $charClasses{$className} = $i;
    $i++;
}
my ($sec, $min, $hour, , $day, $mon, $year, $wday, $yday, $isdst) = localtime;
$year += 1900;
@ -216,59 +244,87 @@ if ($javaOutput) {
print " * rule parser.\n";
print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
print " * \@internal \n";
print " *\n";
print " */\n";
print "public class RuleBasedBreakIteratorStateTable\n";
print "class RBBIRuleParseTable\n";
print "{\n";
#
#
# Emit the constants for the actions to be performed.
#
$n = 1;
foreach $act (sort keys %actions) {
print " public static final int $act = $n;\n";
print " static final short $act = $n;\n";
$n++;
}
print " \n";
#
# emit the state transition table
# Emit constants for char class names
#
print " public static final String[] gRuleParseStateTable = {\n";
printf(" \"\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\"\n", doNOP, 0, 0, 0, 1);
foreach $setName (sort keys %charClasses) {
print " static final short kRuleSet_$setName = $charClasses{$setName};\n";
}
print "\n\n";
print " static class RBBIRuleTableElement { \n";
print " short fAction; \n";
print " short fCharClass; \n";
print " short fNextState; \n";
print " short fPushState; \n";
print " boolean fNextChar; \n";
print " String fStateName; \n";
print " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n";
print " fAction = a; \n";
print " fCharClass = (short)cc; \n";
print " fNextState = (short)ns; \n";
print " fPushState = (short)ps; \n";
print " fNextChar = nc; \n";
print " fStateName = sn; \n";
print " } \n";
print " }; \n";
print " \n";
print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n"; #output the unused state 0.
for ($state=1; $state < $num_states; $state++) {
printf(" , \"\\u%04.4x", $state_func_name[$state]);
# print " , {$state_func_name[$state],";
print " , new RBBIRuleTableElement($state_func_name[$state],";
if ($state_literal_chars[$state] ne "") {
printf("\\u%04.4x", $state_func_name[$state]);
$c = $state_literal_chars[$state];
print("'$c', ");
}else {
printf("\\u%04.4x", $charClasses{$state_char_class[$state]});
print " $charClasses{$state_char_class[$state]},";
}
printf("\\u%04.4x", $states{$state_dest_state[$state]});
print " $states{$state_dest_state[$state]},";
# The push-state field is optional. If omitted, fill field with a zero, which flags
# the state machine that there is no push state.
if ($state_push_state[$state] eq "") {
print "\\u0000";
print "0, ";
} else {
printf("\\u%04.4x", $states{$state_push_state[$state]});
print " $states{$state_push_state[$state]},";
}
printf("\\u%04.4x", $state_flag[$state]);
# For the first row of each state, append the state name.
# Used for debugging only.
print " $state_flag[$state], ";
# if this is the first row of the table for this state, put out the state name.
if ($stateNames[$state] ne "") {
printf("%-20s", $stateNames[$state]."\"");
print " \"$stateNames[$state]\") ";
} else {
printf("%-20s", "\"");
print " null ) ";
}
# Put out a C++ comment showing the number (index) of this state row,
print " // $state ";
# Put out a comment showing the number (index) of this state row,
print " // $state ";
print "\n";
};
print " };\n";
print "}\n";
}
print " };\n";
print "}; \n";
}
else
{
@ -302,25 +358,10 @@ else
print "//\n";
print "// Character classes for RBBI rule scanning.\n";
print "//\n";
$i = 128; # State Table values for Unicode char sets range from 128-250.
# Sets "default", "escaped", etc. get special handling.
# They have no corresponding UnicodeSet object in the state machine,
# but are handled by special case code. So we emit no reference
# to a UnicodeSet object to them here.
foreach $setName (sort keys %charClasses) {
if ($setName eq "default") {
$charClasses{$setName} = 255;}
elsif ($setName eq "escaped") {
$charClasses{$setName} = 254;}
elsif ($setName eq "escapedP") {
$charClasses{$setName} = 253;}
elsif ($setName eq "eof") {
$charClasses{$setName} = 252;}
else {
# Normal character class. Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
print " static const uint8_t kRuleSet_$setName = $i;\n";
$charClasses{$setName} = $i;
$i++;
if ($charClasses{$setName} < 250) {
# Normal character class.
print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
}
}
print "\n\n";