ICU-2014 use index file, update name mapping, cleanup

X-SVN-Rev: 9392
2025-04-20 20:19:32 +00:00 · 2002-07-27 18:15:22 +00:00 · 2002-07-27 18:15:22 +00:00 · c26cb4aaaf
commit c26cb4aaaf
parent 5f481fb5c7
1 changed files with 200 additions and 128 deletions
--- a/icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
+++ b/icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
@ -14,21 +14,25 @@ goto endofperl

 # This perl script creates ICU transliterator data files, that live
 # in icu/data, from ICU4J UTF8 transliterator data files, in
-# icu4j/src/com/ibm/text/resources.
+# icu4j/src/com/ibm/icu/impl/data/.
 #
 # The transformation that is done is very minimal.  The script assumes
 # that the input files use only # comments
 # and that they follow a rigid format.
 #
 # The output files are named according to ICU conventions (see NAME_MAP
-# below) and created in the current directory.	They should be manually
-# checked and then copied into the icu/data directory.  An ICU build must
+# below) and created in the current directory.  They should be manually
+# checked and then copied into the icu/data/translit directory.
+# An ICU build must
 # then be initiated, and the standard suite of ICU transliterator tests
 # should be run after that.
 #
 # Alan Liu 5/19/00 2/27/01

 use Getopt::Long;
+use strict;
+
+use vars qw(%USED_FILES);

 my $DIR = "../../../impl/data";
 my $ID = '';
@ -39,7 +43,7 @@ GetOptions('dir=s' => \$DIR,

 usage() if (@ARGV);

-$ID =~ s/-/_/;
+$ID =~ s/-/_/;  # reuse the $ID declared above; a second "my" here would shadow it with an undefined value
 if (! -d $DIR) {
    print STDERR "$DIR is not a directory\n";
    usage();
@ -50,87 +54,149 @@ sub usage {
    $me =~ s|.+[/\\]||;
    print "Usage: $me [-dir <dir>] [-id <id>]\n";
    print " --dir <dir> Specify the directory containing the\n";
-    print " 	Transliterator_*.txt files\n";
-    print " --id <id>	Specify a single ID to transform, e.g.\n";
-    print " 	Fullwidth-Halfwidth\n";
-    die;
+    print "             Transliterator_*.txt files\n";
+    print " --id <id>   Specify a single ID to transform, e.g.\n";
+    print "             Fullwidth-Halfwidth\n";
+    exit(1);
 }

-$JAVA_ONLY = '-';
+my $JAVA_ONLY = '-';

-$OUTDIR = "icu4c";
+my $OUTDIR = "icu4c";
 mkdir($OUTDIR,0777);

 # Mapping from Java file names to ICU file names
-%NAME_MAP = (
-	 # An ICU name of "" means the ICU name == the ID
+# Constraints on ICU4C file name: icudt20b_
+# |--9 (prefix)---|---18(name with distinguisher,e.g. "t_" )----|
+#  --4 ("."+extn)--| = 31 characters total.
+# That is, each mapped name must satisfy length($NAME_MAP{x}) <= 16 (18 minus the 2-character "t_" prefix).

-	 "Any_Accents" => "",
-	 "Any_Publishing" => "",
-	 "Bengali_InterIndic" => "Beng_InterIndic",
-	 "Cyrillic_Latin" => "Cyrl_Latn",
-	 "Devanagari_InterIndic" => "Deva_InterIndic",
-	 "Fullwidth_Halfwidth" => "FWidth_HWidth",
-	 "Greek_Latin" => "Grek_Latn",
-	 "Gujarati_InterIndic" => "Gujr_InterIndic",
-	 "Gurmukhi_InterIndic" => "Guru_InterIndic",
-	 "Hiragana_Katakana" => "Hira_Kana",
-	 "Hiragana_Latin" => "Hira_Latn",
-	 "InterIndic_Bengali" => "InterIndic_Beng",
-	 "InterIndic_Devanagari" => "InterIndic_Deva",
-	 "InterIndic_Gujarati" => "InterIndic_Gujr",
-	 "InterIndic_Gurmukhi" => "InterIndic_Guru",
-	 "InterIndic_Kannada" => "InterIndic_Knda",
-	 "InterIndic_Latin" => "InterIndic_Latn",
-	 "InterIndic_Malayalam" => "InterIndic_Mlym",
-	 "InterIndic_Oriya" => "InterIndic_Orya",
-	 "InterIndic_Tamil" => "InterIndic_Taml",
-	 "InterIndic_Telugu" => "InterIndic_Telu",
-	 "Kannada_InterIndic" => "Knda_InterIndic",
-	 "Latin_InterIndic" => "Latn_InterIndic",
-	 "Latin_Jamo" => "Latn_Jamo",
-	 "Latin_Katakana" => "Latn_Kana",
-	 "Malayalam_InterIndic" => "Mlym_InterIndic",
-	 "Oriya_InterIndic" => "Orya_InterIndic",
-	 "Tamil_InterIndic" => "Taml_InterIndic",
-	 "Telugu_InterIndic" => "Telu_InterIndic",
-	 
-	 "Han_Pinyin" => $JAVA_ONLY,
-	 "Kanji_English" => $JAVA_ONLY,
-	 "Kanji_OnRomaji" => $JAVA_ONLY,
-	 );
+my $MAX_ICU4C_FILENAME_LEN = 18;
+
+# -- HISTORY -- If not marked, then pre 2.2.
+#               All InterIndic are pre 2.2.
+# Any_Accents
+# Any_Publishing
+# Arabic_Latin           * 2.2
+# Cyrillic_Latin
+# Fullwidth_Halfwidth
+# Greek_Latin
+# Greek_Latin_UNGEGN     * 2.2 (moved from el.txt)
+# Han_Latin              * 2.2
+# Han_Latin_Definition   * 2.2
+# Han_Latin_EDICT        * 2.2 J only
+# Hebrew_Latin           * 2.2
+# Hiragana_Katakana
+# Hiragana_Latin
+# Latin_Jamo
+# Latin_Katakana
+# ThaiLogical_Latin      * 2.2 J only
+# Thai_ThaiLogical       * 2.2 J only
+# Thai_ThaiSemi          * 2.2 J only
+
+my %NAME_MAP = (
+     # An ICU name of "" means the ICU name == the ID
+
+     # We filter names based on what is in use in the index file.
+
+     # Flag a rule as JAVA_ONLY if it exists and we use it in Java,
+     # but we don't use it in C.
+
+     # Use official script abbreviations where possible.
+
+   # |..............|           |..............|             
+   # 1234567890123456           1234567890123456
+     Any_Accents            => "",
+     Any_Publishing         => "",
+     Cyrillic_Latin         => "Cyrl_Latn",
+     Fullwidth_Halfwidth    => "FWidth_HWidth",
+     Greek_Latin            => "Grek_Latn",
+     Hiragana_Katakana      => "Hira_Kana",
+     Hiragana_Latin         => "Hira_Latn",
+     Latin_Jamo             => "Latn_Jamo",
+     Latin_Katakana         => "Latn_Kana",
+
+     Arabic_Latin           => "Arab_Latn",
+     Greek_Latin_UNGEGN     => "Grek_Latn_UNGEGN",
+     Han_Latin              => "Hani_Latn",
+     Han_Latin_Definition   => "Hani_Latn_Def",
+     Han_Latin_EDICT        => "Hani_Latn_EDICT",
+     Hebrew_Latin           => "Hebr_Latn",
+     ThaiLogical_Latin      => $JAVA_ONLY, # "ThaiLog_Latn",
+     Thai_ThaiLogical       => $JAVA_ONLY, # "Thai_ThaiLog",
+     Thai_ThaiSemi          => $JAVA_ONLY, # "Thai_ThaiSemi",
+
+     InterIndic_Bengali     => "InterIndic_Beng",
+     InterIndic_Devanagari  => "InterIndic_Deva",
+     InterIndic_Gujarati    => "InterIndic_Gujr",
+     InterIndic_Gurmukhi    => "InterIndic_Guru",
+     InterIndic_Kannada     => "InterIndic_Knda",
+     InterIndic_Latin       => "InterIndic_Latn",
+     InterIndic_Malayalam   => "InterIndic_Mlym",
+     InterIndic_Oriya       => "InterIndic_Orya",
+     InterIndic_Tamil       => "InterIndic_Taml",
+     InterIndic_Telugu      => "InterIndic_Telu",
+
+     Bengali_InterIndic     => "Beng_InterIndic",
+     Devanagari_InterIndic  => "Deva_InterIndic",
+     Gujarati_InterIndic    => "Gujr_InterIndic",
+     Gurmukhi_InterIndic    => "Guru_InterIndic",
+     Kannada_InterIndic     => "Knda_InterIndic",
+     Latin_InterIndic       => "Latn_InterIndic",
+     Malayalam_InterIndic   => "Mlym_InterIndic",
+     Oriya_InterIndic       => "Orya_InterIndic",
+     Tamil_InterIndic       => "Taml_InterIndic",
+     Telugu_InterIndic      => "Telu_InterIndic",
+     
+     Han_Pinyin             => $JAVA_ONLY,
+     Kanji_English          => $JAVA_ONLY,
+     Kanji_OnRomaji         => $JAVA_ONLY,
+     );
+
+my $THIS_YEAR = (localtime())[5];  # element 5 of localtime() is the year counted from 1900
+$THIS_YEAR += 1900;

 # Header blocks of text written at start of ICU output files
-$HEADER1 = <<END;
+my $HEADER1 = <<END;
 //--------------------------------------------------------------------
-// Copyright (c) 1999-2004, International Business Machines
+// Copyright (c) 1999-$THIS_YEAR, International Business Machines
 // Corporation and others.  All Rights Reserved.
 //--------------------------------------------------------------------
 // THIS IS A MACHINE-GENERATED FILE
 END
-$HEADER2 = <<END;
+my $HEADER2 = <<END;
 //--------------------------------------------------------------------
 END

-$TOOL = $0;
+my $TOOL = $0;
+
+# Convert the index first; this tells us which rule files are in use.
+convertIndex();
+
+# print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";

 # Iterate over all Java RBT rule files
 foreach (<$DIR/Transliterator_*.txt>) {
-    next if (/~$/);
-    next if (/_index\.txt$/);
-    next if ($ID && !/$ID/);
-    my ($out, $id) = convertFileName($_);
-    if ($out) {
-        if ($out eq $JAVA_ONLY) {
-	print STDERR "$id: Java only\n";
-	next;
+    next if (/~$/); # Ignore emacs backups
+    next if (/_index\.txt$/); # The index file was processed above
+    # Select either the command-line arg, if there was one, or
+    # any files mentioned in the index.
+    my $leaf = $_;
+    $leaf =~ s|.+[/\\]||;
+    if (($ID && $leaf =~ /$ID/) || exists $USED_FILES{$leaf}) {
+        my ($out, $id) = convertFileName($_);
+        if ($out) {
+            if ($out eq $JAVA_ONLY) {
+                print STDERR "*** $id skipped: Java only ***\n";
+                next;
+            }
+            file($id, $_, $out);
        }
-        file($id, $_, $out);
+    } elsif (!$ID) {
+        print "*** $leaf skipped: not in use ***\n";
    }
 }

-convertIndex();
-
 ######################################################################
 # Convert a Java file name to C
 # Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|
@ -155,14 +221,18 @@ sub convertFileName {
    if ($out ne $JAVA_ONLY) {
        $out = 't_' . $out;
    }
+    if (length($out) > $MAX_ICU4C_FILENAME_LEN) {
+        print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n";
+        return '';
+    }
    return ($out, $id);
 }

 ######################################################################
 # Convert the index file from Java to C format
 sub convertIndex {
-    $JAVA_INDEX = "Transliterator_index.txt";
-    $C_INDEX = "translit_index.txt";
+    my $JAVA_INDEX = "Transliterator_index.txt";
+    my $C_INDEX = "translit_index.txt";
    open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
    open(C_INDEX, ">$OUTDIR/$C_INDEX") or die;
    
@ -182,7 +252,7 @@ sub convertIndex {
 translit_index {
  RuleBasedTransliteratorIDs {
 END
-	    
+        
    while (<JAVA_INDEX>) {
        # ignore $Source $Revision $Date CVS keyword substitutions
        next if /\$Source/ ;
@ -195,13 +265,13 @@ END

        # Comments; change # to //
        if (s|^(\s*)\#|$1//|) {
-	print C_INDEX;
-	next;
+            print C_INDEX;
+            next;
        }
        # Blank lines
        if (!/\S/) {
-	print C_INDEX;
-	next;
+            print C_INDEX;
+            next;
        }
        # Content lines
        chomp;
@ -210,26 +280,28 @@ END
        $_=~ s/\\p/\\\\p/g;
        my @a = split(':', $_);
        if ($a[1] eq 'file' || $a[1] eq 'internal') {
-	# Convert the file name
-	my $id;
-	($a[2], $id) = convertFileName($a[2]);
-	if ($a[2] eq $JAVA_ONLY) {
-	    $prefix = '// Java only: ';
-	}
+            # Convert the file name
+            my $id;
+            # Record file names in use
+            $USED_FILES{$a[2]} = 1;
+            ($a[2], $id) = convertFileName($a[2]);
+            if ($a[2] eq $JAVA_ONLY) {
+                $prefix = '// Java only: ';
+            }

-	# Delete the encoding field
-	splice(@a, 3, 1);
+            # Delete the encoding field
+            splice(@a, 3, 1);
        } elsif ($a[1] eq 'alias') {
-	# Pad out with extra blank fields to make the
-	# 2-d array square
-	push @a, "";
+            # Pad out with extra blank fields to make the
+            # 2-d array square
+            push @a, "";
        } else {
-	die "Can't parse $_";
+            die "Can't parse $_";
        }
        print C_INDEX
-	$prefix, "{ ",
-	join(", ", map("\"$_\"", @a)),
-	" },\n";
+            $prefix, "{ ",
+            join(", ", map("\"$_\"", @a)),
+            " },\n";
    }

    print C_INDEX <<END;
@ -273,7 +345,7 @@ sub file {

    # Open file, write UTF8 marker, close it, and reopen in text mode
    open(OUT, ">$OUTDIR/$OUT") or die;
-    binmode OUT; # Must do this so we can write our UTF8 marker
+    binmode OUT;        # Must do this so we can write our UTF8 marker
    print OUT pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
    close(OUT);

@ -287,7 +359,7 @@ sub file {
    print OUT "  Rule {\n";

    open(IN, $IN) or die;
-    binmode IN; # IN is a UTF8 file
+    binmode IN;                 # IN is a UTF8 file

    my $first = 1;
    my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark
@ -296,21 +368,21 @@ sub file {
    # and taking other text and enclosing it in double quotes
    while (<IN>) {
        my $raw = $_;
-		# ignore $Source $Revision $Date CVS keyword substitutions
-		next if /\$Source/ ;
-		next if /\$Revision/ ;
-		next if /\$Date/ ;
-		
-		# we have printed out the copyright info ... ignore one in Java version
-		next if /Copyright/ ;
-		next if /Corporation/;
-		
+        # ignore $Source $Revision $Date CVS keyword substitutions
+        next if /\$Source/ ;
+        next if /\$Revision/ ;
+        next if /\$Date/ ;
+
+        # we have printed out the copyright info ... ignore one in Java version
+        next if /Copyright/ ;
+        next if /Corporation/;
+
        # Look for and delete BOM
        if ($first) {
-	s/^$BOM//;
-	$first = 0;
+            s/^$BOM//;
+            $first = 0;
        }
-        
+
        # Clean the eol junk up
        s/[\x0D\x0A]+$//;

@ -319,51 +391,51 @@ sub file {
        # concatenated.  Count trailing backslashes; if they are odd,
        # one is trailing.
        if (m|(\\+)$|) {
-	if ((length($1) % 2) == 1) {
-	    s|\\$||;
-	}
+            if ((length($1) % 2) == 1) {
+                s|\\$||;
+            }
        }

        # Transform escaped characters
        hideEscapes();

        if (/^(\s*)(\#.*)$/) {
-	# Comment-only line
-	my ($white, $cmt) = ($1, $2);
-	$cmt =~ s|\#|//|;
-	$_ = $white . $cmt;
+            # Comment-only line
+            my ($white, $cmt) = ($1, $2);
+            $cmt =~ s|\#|//|;
+            $_ = $white . $cmt;

        } elsif (!/\S/) {
-	# Blank line -- leave as-is
+            # Blank line -- leave as-is

        } else {
-	# Remove single-quoted matter 
-	my @quotes;
-	my $nquotes = 0;
-	my $x = $_;
-	while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
-	    push @quotes, $2;
-	    ++$nquotes;
-	}
+            # Remove single-quoted matter 
+            my @quotes;
+            my $nquotes = 0;
+            my $x = $_;
+            while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
+                push @quotes, $2;
+                ++$nquotes;
+            }

-	# Extract comment
-	my $cmt = '';
-	if (s|\#(.*)||) {
-	    $cmt = '//' . $1;
-	}
-	
-	# Add quotes
-	s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
+            # Extract comment
+            my $cmt = '';
+            if (s|\#(.*)||) {
+                $cmt = '//' . $1;
+            }

-	# Restore single-quoted matter
-	for (my $i=0; $i<$nquotes; ++$i) {
-	    s|<<x$i>>|$quotes[$i]|;
-	}
+            # Add quotes
+            s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;

-	# Restore comment
-	$_ .= $cmt;
+            # Restore single-quoted matter
+            for (my $i=0; $i<$nquotes; ++$i) {
+                s|<<x$i>>|$quotes[$i]|;
+            }
+
+            # Restore comment
+            $_ .= $cmt;
        }
-        
+
        # Restore escaped characters
        restoreEscapes();