ICU-2014 use index file, update name mapping, cleanup

X-SVN-Rev: 9392
This commit is contained in:
Alan Liu 2002-07-27 18:15:22 +00:00
parent 5f481fb5c7
commit c26cb4aaaf

View file

@ -14,21 +14,25 @@ goto endofperl
# This perl script creates ICU transliterator data files, that live
# in icu/data, from ICU4J UTF8 transliterator data files, in
# icu4j/src/com/ibm/text/resources.
# icu4j/src/com/ibm/icu/impl/data/.
#
# The transformation that is done is very minimal. The script assumes
# that the input files use only # comments
# and that they follow a rigid format.
#
# The output files are named according to ICU conventions (see NAME_MAP
# below) and created in the current directory. They should be manually
# checked and then copied into the icu/data directory. An ICU build must
# below) and created in the current directory. They should be manually
# checked and then copied into the icu/data/translit directory.
# An ICU build must
# then be initiated, and the standard suite of ICU transliterator tests
# should be run after that.
#
# Alan Liu 5/19/00 2/27/01
use Getopt::Long;
use strict;
use vars qw(%USED_FILES);
my $DIR = "../../../impl/data";
my $ID = '';
@ -39,7 +43,7 @@ GetOptions('dir=s' => \$DIR,
usage() if (@ARGV);
# Normalize the --id argument to the underscore form used in the rule
# file names (e.g. Fullwidth-Halfwidth -> Fullwidth_Halfwidth).  This
# must operate on the $ID already declared near the top of the script:
# writing "my $ID" here would declare a fresh, empty variable that hides
# the original, so the command-line ID would be silently discarded.
# tr/// converts every hyphen, so three-part IDs such as
# Greek-Latin-UNGEGN are handled as well.
$ID =~ tr/-/_/;
if (! -d $DIR) {
print STDERR "$DIR is not a directory\n";
usage();
@ -50,87 +54,149 @@ sub usage {
$me =~ s|.+[/\\]||;
print "Usage: $me [-dir <dir>] [-id <id>]\n";
print " --dir <dir> Specify the directory containing the\n";
print " Transliterator_*.txt files\n";
print " --id <id> Specify a single ID to transform, e.g.\n";
print " Fullwidth-Halfwidth\n";
die;
print " Transliterator_*.txt files\n";
print " --id <id> Specify a single ID to transform, e.g.\n";
print " Fullwidth-Halfwidth\n";
exit(1);
}
$JAVA_ONLY = '-';
my $JAVA_ONLY = '-';
$OUTDIR = "icu4c";
my $OUTDIR = "icu4c";
mkdir($OUTDIR,0777);
# Mapping from Java file names to ICU file names
%NAME_MAP = (
# An ICU name of "" means the ICU name == the ID
# Constraints on ICU4C file name: icudt20b_
# |--9 (prefix)---|---18(name with distinguisher,e.g. "t_" )----|
# --4 ("."+extn)--| = 31 characters total.
# That is, must have length(%NAME_MAP{x}) <= 16
"Any_Accents" => "",
"Any_Publishing" => "",
"Bengali_InterIndic" => "Beng_InterIndic",
"Cyrillic_Latin" => "Cyrl_Latn",
"Devanagari_InterIndic" => "Deva_InterIndic",
"Fullwidth_Halfwidth" => "FWidth_HWidth",
"Greek_Latin" => "Grek_Latn",
"Gujarati_InterIndic" => "Gujr_InterIndic",
"Gurmukhi_InterIndic" => "Guru_InterIndic",
"Hiragana_Katakana" => "Hira_Kana",
"Hiragana_Latin" => "Hira_Latn",
"InterIndic_Bengali" => "InterIndic_Beng",
"InterIndic_Devanagari" => "InterIndic_Deva",
"InterIndic_Gujarati" => "InterIndic_Gujr",
"InterIndic_Gurmukhi" => "InterIndic_Guru",
"InterIndic_Kannada" => "InterIndic_Knda",
"InterIndic_Latin" => "InterIndic_Latn",
"InterIndic_Malayalam" => "InterIndic_Mlym",
"InterIndic_Oriya" => "InterIndic_Orya",
"InterIndic_Tamil" => "InterIndic_Taml",
"InterIndic_Telugu" => "InterIndic_Telu",
"Kannada_InterIndic" => "Knda_InterIndic",
"Latin_InterIndic" => "Latn_InterIndic",
"Latin_Jamo" => "Latn_Jamo",
"Latin_Katakana" => "Latn_Kana",
"Malayalam_InterIndic" => "Mlym_InterIndic",
"Oriya_InterIndic" => "Orya_InterIndic",
"Tamil_InterIndic" => "Taml_InterIndic",
"Telugu_InterIndic" => "Telu_InterIndic",
"Han_Pinyin" => $JAVA_ONLY,
"Kanji_English" => $JAVA_ONLY,
"Kanji_OnRomaji" => $JAVA_ONLY,
);
my $MAX_ICU4C_FILENAME_LEN = 18;
# -- HISTORY -- If not marked, then pre 2.2.
# All InterIndic are pre 2.2.
# Any_Accents
# Any_Publishing
# Arabic_Latin * 2.2
# Cyrillic_Latin
# Fullwidth_Halfwidth
# Greek_Latin
# Greek_Latin_UNGEGN * 2.2 (moved from el.txt)
# Han_Latin * 2.2
# Han_Latin_Definition * 2.2
# Han_Latin_EDICT * 2.2 J only
# Hebrew_Latin * 2.2
# Hiragana_Katakana
# Hiragana_Latin
# Latin_Jamo
# Latin_Katakana
# ThaiLogical_Latin * 2.2 J only
# Thai_ThaiLogical * 2.2 J only
# Thai_ThaiSemi * 2.2 J only
my %NAME_MAP = (
# An ICU name of "" means the ICU name == the ID
# We filter names based on what is in use in the index file.
# Flag a rule as JAVA_ONLY if it exists and we use it in Java,
# but we don't use it in C.
# Use official script abbreviations where possible.
# |..............| |..............|
# 1234567890123456 1234567890123456
Any_Accents => "",
Any_Publishing => "",
Cyrillic_Latin => "Cyrl_Latn",
Fullwidth_Halfwidth => "FWidth_HWidth",
Greek_Latin => "Grek_Latn",
Hiragana_Katakana => "Hira_Kana",
Hiragana_Latin => "Hira_Latn",
Latin_Jamo => "Latn_Jamo",
Latin_Katakana => "Latn_Kana",
Arabic_Latin => "Arab_Latn",
Greek_Latin_UNGEGN => "Grek_Latn_UNGEGN",
Han_Latin => "Hani_Latn",
Han_Latin_Definition => "Hani_Latn_Def",
Han_Latin_EDICT => "Hani_Latn_EDICT",
Hebrew_Latin => "Hebr_Latn",
ThaiLogical_Latin => $JAVA_ONLY, # "ThaiLog_Latn",
Thai_ThaiLogical => $JAVA_ONLY, # "Thai_ThaiLog",
Thai_ThaiSemi => $JAVA_ONLY, # "Thai_ThaiSemi",
InterIndic_Bengali => "InterIndic_Beng",
InterIndic_Devanagari => "InterIndic_Deva",
InterIndic_Gujarati => "InterIndic_Gujr",
InterIndic_Gurmukhi => "InterIndic_Guru",
InterIndic_Kannada => "InterIndic_Knda",
InterIndic_Latin => "InterIndic_Latn",
InterIndic_Malayalam => "InterIndic_Mlym",
InterIndic_Oriya => "InterIndic_Orya",
InterIndic_Tamil => "InterIndic_Taml",
InterIndic_Telugu => "InterIndic_Telu",
Bengali_InterIndic => "Beng_InterIndic",
Devanagari_InterIndic => "Deva_InterIndic",
Gujarati_InterIndic => "Gujr_InterIndic",
Gurmukhi_InterIndic => "Guru_InterIndic",
Kannada_InterIndic => "Knda_InterIndic",
Latin_InterIndic => "Latn_InterIndic",
Malayalam_InterIndic => "Mlym_InterIndic",
Oriya_InterIndic => "Orya_InterIndic",
Tamil_InterIndic => "Taml_InterIndic",
Telugu_InterIndic => "Telu_InterIndic",
Han_Pinyin => $JAVA_ONLY,
Kanji_English => $JAVA_ONLY,
Kanji_OnRomaji => $JAVA_ONLY,
);
# Current four-digit year, interpolated into the copyright line of the
# generated file header.  localtime() returns the year as an offset from
# 1900 in element 5 of its list; a list slice picks that element out
# directly instead of declaring the same throwaway $x five times in one
# my() list (which triggers "masks earlier declaration" warnings).
my $THIS_YEAR = (localtime())[5] + 1900;
# Header blocks of text written at start of ICU output files
$HEADER1 = <<END;
my $HEADER1 = <<END;
//--------------------------------------------------------------------
// Copyright (c) 1999-2004, International Business Machines
// Copyright (c) 1999-$THIS_YEAR, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
END
$HEADER2 = <<END;
my $HEADER2 = <<END;
//--------------------------------------------------------------------
END
$TOOL = $0;
my $TOOL = $0;
# Convert the index first; this tells us which rule files are in use.
convertIndex();
# print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";
# Iterate over all Java RBT rule files
foreach (<$DIR/Transliterator_*.txt>) {
next if (/~$/);
next if (/_index\.txt$/);
next if ($ID && !/$ID/);
my ($out, $id) = convertFileName($_);
if ($out) {
if ($out eq $JAVA_ONLY) {
print STDERR "$id: Java only\n";
next;
next if (/~$/); # Ignore emacs backups
next if (/_index\.txt$/); # The index file was processed above
# Select either the command-line arg, if there was one, or
# any files mentioned in the index.
my $leaf = $_;
$leaf =~ s|.+[/\\]||;
if (($ID && $leaf =~ /$ID/) || exists $USED_FILES{$leaf}) {
my ($out, $id) = convertFileName($_);
if ($out) {
if ($out eq $JAVA_ONLY) {
print STDERR "*** $id skipped: Java only ***\n";
next;
}
file($id, $_, $out);
}
file($id, $_, $out);
} elsif (!$ID) {
print "*** $leaf skipped: not in use ***\n";
}
}
convertIndex();
######################################################################
# Convert a Java file name to C
# Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|
@ -155,14 +221,18 @@ sub convertFileName {
if ($out ne $JAVA_ONLY) {
$out = 't_' . $out;
}
if (length($out) > $MAX_ICU4C_FILENAME_LEN) {
print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n";
return '';
}
return ($out, $id);
}
######################################################################
# Convert the index file from Java to C format
sub convertIndex {
$JAVA_INDEX = "Transliterator_index.txt";
$C_INDEX = "translit_index.txt";
my $JAVA_INDEX = "Transliterator_index.txt";
my $C_INDEX = "translit_index.txt";
open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
open(C_INDEX, ">$OUTDIR/$C_INDEX") or die;
@ -182,7 +252,7 @@ sub convertIndex {
translit_index {
RuleBasedTransliteratorIDs {
END
while (<JAVA_INDEX>) {
# ignore $Source $Revision $Date CVS keyword substitutions
next if /\$Source/ ;
@ -195,13 +265,13 @@ END
# Comments; change # to //
if (s|^(\s*)\#|$1//|) {
print C_INDEX;
next;
print C_INDEX;
next;
}
# Blank lines
if (!/\S/) {
print C_INDEX;
next;
print C_INDEX;
next;
}
# Content lines
chomp;
@ -210,26 +280,28 @@ END
$_=~ s/\\p/\\\\p/g;
my @a = split(':', $_);
if ($a[1] eq 'file' || $a[1] eq 'internal') {
# Convert the file name
my $id;
($a[2], $id) = convertFileName($a[2]);
if ($a[2] eq $JAVA_ONLY) {
$prefix = '// Java only: ';
}
# Convert the file name
my $id;
# Record file names in use
$USED_FILES{$a[2]} = 1;
($a[2], $id) = convertFileName($a[2]);
if ($a[2] eq $JAVA_ONLY) {
$prefix = '// Java only: ';
}
# Delete the encoding field
splice(@a, 3, 1);
# Delete the encoding field
splice(@a, 3, 1);
} elsif ($a[1] eq 'alias') {
# Pad out with extra blank fields to make the
# 2-d array square
push @a, "";
# Pad out with extra blank fields to make the
# 2-d array square
push @a, "";
} else {
die "Can't parse $_";
die "Can't parse $_";
}
print C_INDEX
$prefix, "{ ",
join(", ", map("\"$_\"", @a)),
" },\n";
$prefix, "{ ",
join(", ", map("\"$_\"", @a)),
" },\n";
}
print C_INDEX <<END;
@ -273,7 +345,7 @@ sub file {
# Open file, write UTF8 marker, close it, and reopen in text mode
open(OUT, ">$OUTDIR/$OUT") or die;
binmode OUT; # Must do this so we can write our UTF8 marker
binmode OUT; # Must do this so we can write our UTF8 marker
print OUT pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
close(OUT);
@ -287,7 +359,7 @@ sub file {
print OUT " Rule {\n";
open(IN, $IN) or die;
binmode IN; # IN is a UTF8 file
binmode IN; # IN is a UTF8 file
my $first = 1;
my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark
@ -296,21 +368,21 @@ sub file {
# and taking other text and enclosing it in double quotes
while (<IN>) {
my $raw = $_;
# ignore $Source $Revision $Date CVS keyword substitutions
next if /\$Source/ ;
next if /\$Revision/ ;
next if /\$Date/ ;
# we have printed out the copyright info ... ignore one in Java version
next if /Copyright/ ;
next if /Corporation/;
# ignore $Source $Revision $Date CVS keyword substitutions
next if /\$Source/ ;
next if /\$Revision/ ;
next if /\$Date/ ;
# we have printed out the copyright info ... ignore one in Java version
next if /Copyright/ ;
next if /Corporation/;
# Look for and delete BOM
if ($first) {
s/^$BOM//;
$first = 0;
s/^$BOM//;
$first = 0;
}
# Clean the eol junk up
s/[\x0D\x0A]+$//;
@ -319,51 +391,51 @@ sub file {
# concatenated. Count trailing backslashes; if they are odd,
# one is trailing.
if (m|(\\+)$|) {
if ((length($1) % 2) == 1) {
s|\\$||;
}
if ((length($1) % 2) == 1) {
s|\\$||;
}
}
# Transform escaped characters
hideEscapes();
if (/^(\s*)(\#.*)$/) {
# Comment-only line
my ($white, $cmt) = ($1, $2);
$cmt =~ s|\#|//|;
$_ = $white . $cmt;
# Comment-only line
my ($white, $cmt) = ($1, $2);
$cmt =~ s|\#|//|;
$_ = $white . $cmt;
} elsif (!/\S/) {
# Blank line -- leave as-is
# Blank line -- leave as-is
} else {
# Remove single-quoted matter
my @quotes;
my $nquotes = 0;
my $x = $_;
while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
push @quotes, $2;
++$nquotes;
}
# Remove single-quoted matter
my @quotes;
my $nquotes = 0;
my $x = $_;
while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
push @quotes, $2;
++$nquotes;
}
# Extract comment
my $cmt = '';
if (s|\#(.*)||) {
$cmt = '//' . $1;
}
# Add quotes
s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
# Extract comment
my $cmt = '';
if (s|\#(.*)||) {
$cmt = '//' . $1;
}
# Restore single-quoted matter
for (my $i=0; $i<$nquotes; ++$i) {
s|<<x$i>>|$quotes[$i]|;
}
# Add quotes
s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
# Restore comment
$_ .= $cmt;
# Restore single-quoted matter
for (my $i=0; $i<$nquotes; ++$i) {
s|<<x$i>>|$quotes[$i]|;
}
# Restore comment
$_ .= $cmt;
}
# Restore escaped characters
restoreEscapes();