mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-20 20:19:32 +00:00
ICU-2014 use index file, update name mapping, cleanup
X-SVN-Rev: 9392
This commit is contained in:
parent
5f481fb5c7
commit
c26cb4aaaf
1 changed files with 200 additions and 128 deletions
|
@ -14,21 +14,25 @@ goto endofperl
|
|||
|
||||
# This perl script creates ICU transliterator data files, that live
|
||||
# in icu/data, from ICU4J UTF8 transliterator data files, in
|
||||
# icu4j/src/com/ibm/text/resources.
|
||||
# icu4j/src/com/ibm/icu/impl/data/.
|
||||
#
|
||||
# The transformation that is done is very minimal. The script assumes
|
||||
# that the input files use only # comments
|
||||
# and that they follow a rigid format.
|
||||
#
|
||||
# The output files are named according to ICU conventions (see NAME_MAP
|
||||
# below) and created in the current directory. They should be manually
|
||||
# checked and then copied into the icu/data directory. An ICU build must
|
||||
# below) and created in the current directory. They should be manually
|
||||
# checked and then copied into the icu/data/translit directory.
|
||||
# An ICU build must
|
||||
# then be initiated, and the standard suite of ICU transliterator tests
|
||||
# should be run after that.
|
||||
#
|
||||
# Alan Liu 5/19/00 2/27/01
|
||||
|
||||
use Getopt::Long;
|
||||
use strict;
|
||||
|
||||
use vars qw(%USED_FILES);
|
||||
|
||||
my $DIR = "../../../impl/data";
|
||||
my $ID = '';
|
||||
|
@ -39,7 +43,7 @@ GetOptions('dir=s' => \$DIR,
|
|||
|
||||
usage() if (@ARGV);
|
||||
|
||||
$ID =~ s/-/_/;
|
||||
# Normalize the requested ID (e.g. Fullwidth-Halfwidth) to the
# underscore form used in the rule file names.  This must act on the
# $ID declared near the top of the script: writing "my $ID" here would
# introduce a fresh, undefined variable that masks the original, so
# the later $ID checks would never see the requested value.
$ID =~ s/-/_/;
|
||||
if (! -d $DIR) {
|
||||
print STDERR "$DIR is not a directory\n";
|
||||
usage();
|
||||
|
@ -50,87 +54,149 @@ sub usage {
|
|||
$me =~ s|.+[/\\]||;
|
||||
print "Usage: $me [-dir <dir>] [-id <id>]\n";
|
||||
print " --dir <dir> Specify the directory containing the\n";
|
||||
print " Transliterator_*.txt files\n";
|
||||
print " --id <id> Specify a single ID to transform, e.g.\n";
|
||||
print " Fullwidth-Halfwidth\n";
|
||||
die;
|
||||
print " Transliterator_*.txt files\n";
|
||||
print " --id <id> Specify a single ID to transform, e.g.\n";
|
||||
print " Fullwidth-Halfwidth\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
$JAVA_ONLY = '-';
|
||||
my $JAVA_ONLY = '-';
|
||||
|
||||
$OUTDIR = "icu4c";
|
||||
my $OUTDIR = "icu4c";
|
||||
mkdir($OUTDIR,0777);
|
||||
|
||||
# Mapping from Java file names to ICU file names
|
||||
%NAME_MAP = (
|
||||
# An ICU name of "" means the ICU name == the ID
|
||||
# Constraints on ICU4C file name: icudt20b_
|
||||
# |--9 (prefix)---|---18(name with distinguisher,e.g. "t_" )----|
|
||||
# --4 ("."+extn)--| = 31 characters total.
|
||||
# That is, must have length(%NAME_MAP{x}) <= 16
|
||||
|
||||
"Any_Accents" => "",
|
||||
"Any_Publishing" => "",
|
||||
"Bengali_InterIndic" => "Beng_InterIndic",
|
||||
"Cyrillic_Latin" => "Cyrl_Latn",
|
||||
"Devanagari_InterIndic" => "Deva_InterIndic",
|
||||
"Fullwidth_Halfwidth" => "FWidth_HWidth",
|
||||
"Greek_Latin" => "Grek_Latn",
|
||||
"Gujarati_InterIndic" => "Gujr_InterIndic",
|
||||
"Gurmukhi_InterIndic" => "Guru_InterIndic",
|
||||
"Hiragana_Katakana" => "Hira_Kana",
|
||||
"Hiragana_Latin" => "Hira_Latn",
|
||||
"InterIndic_Bengali" => "InterIndic_Beng",
|
||||
"InterIndic_Devanagari" => "InterIndic_Deva",
|
||||
"InterIndic_Gujarati" => "InterIndic_Gujr",
|
||||
"InterIndic_Gurmukhi" => "InterIndic_Guru",
|
||||
"InterIndic_Kannada" => "InterIndic_Knda",
|
||||
"InterIndic_Latin" => "InterIndic_Latn",
|
||||
"InterIndic_Malayalam" => "InterIndic_Mlym",
|
||||
"InterIndic_Oriya" => "InterIndic_Orya",
|
||||
"InterIndic_Tamil" => "InterIndic_Taml",
|
||||
"InterIndic_Telugu" => "InterIndic_Telu",
|
||||
"Kannada_InterIndic" => "Knda_InterIndic",
|
||||
"Latin_InterIndic" => "Latn_InterIndic",
|
||||
"Latin_Jamo" => "Latn_Jamo",
|
||||
"Latin_Katakana" => "Latn_Kana",
|
||||
"Malayalam_InterIndic" => "Mlym_InterIndic",
|
||||
"Oriya_InterIndic" => "Orya_InterIndic",
|
||||
"Tamil_InterIndic" => "Taml_InterIndic",
|
||||
"Telugu_InterIndic" => "Telu_InterIndic",
|
||||
|
||||
"Han_Pinyin" => $JAVA_ONLY,
|
||||
"Kanji_English" => $JAVA_ONLY,
|
||||
"Kanji_OnRomaji" => $JAVA_ONLY,
|
||||
);
|
||||
my $MAX_ICU4C_FILENAME_LEN = 18;
|
||||
|
||||
# -- HISTORY -- If not marked, then pre 2.2.
|
||||
# All InterIndic are pre 2.2.
|
||||
# Any_Accents
|
||||
# Any_Publishing
|
||||
# Arabic_Latin * 2.2
|
||||
# Cyrillic_Latin
|
||||
# Fullwidth_Halfwidth
|
||||
# Greek_Latin
|
||||
# Greek_Latin_UNGEGN * 2.2 (moved from el.txt)
|
||||
# Han_Latin * 2.2
|
||||
# Han_Latin_Definition * 2.2
|
||||
# Han_Latin_EDICT * 2.2 J only
|
||||
# Hebrew_Latin * 2.2
|
||||
# Hiragana_Katakana
|
||||
# Hiragana_Latin
|
||||
# Latin_Jamo
|
||||
# Latin_Katakana
|
||||
# ThaiLogical_Latin * 2.2 J only
|
||||
# Thai_ThaiLogical * 2.2 J only
|
||||
# Thai_ThaiSemi * 2.2 J only
|
||||
|
||||
my %NAME_MAP = (
|
||||
# An ICU name of "" means the ICU name == the ID
|
||||
|
||||
# We filter names based on what is in use in the index file.
|
||||
|
||||
# Flag a rule as JAVA_ONLY if it exists and we use it in Java,
|
||||
# but we don't use it in C.
|
||||
|
||||
# Use official script abbreviations where possible.
|
||||
|
||||
# |..............| |..............|
|
||||
# 1234567890123456 1234567890123456
|
||||
Any_Accents => "",
|
||||
Any_Publishing => "",
|
||||
Cyrillic_Latin => "Cyrl_Latn",
|
||||
Fullwidth_Halfwidth => "FWidth_HWidth",
|
||||
Greek_Latin => "Grek_Latn",
|
||||
Hiragana_Katakana => "Hira_Kana",
|
||||
Hiragana_Latin => "Hira_Latn",
|
||||
Latin_Jamo => "Latn_Jamo",
|
||||
Latin_Katakana => "Latn_Kana",
|
||||
|
||||
Arabic_Latin => "Arab_Latn",
|
||||
Greek_Latin_UNGEGN => "Grek_Latn_UNGEGN",
|
||||
Han_Latin => "Hani_Latn",
|
||||
Han_Latin_Definition => "Hani_Latn_Def",
|
||||
Han_Latin_EDICT => "Hani_Latn_EDICT",
|
||||
Hebrew_Latin => "Hebr_Latn",
|
||||
ThaiLogical_Latin => $JAVA_ONLY, # "ThaiLog_Latn",
|
||||
Thai_ThaiLogical => $JAVA_ONLY, # "Thai_ThaiLog",
|
||||
Thai_ThaiSemi => $JAVA_ONLY, # "Thai_ThaiSemi",
|
||||
|
||||
InterIndic_Bengali => "InterIndic_Beng",
|
||||
InterIndic_Devanagari => "InterIndic_Deva",
|
||||
InterIndic_Gujarati => "InterIndic_Gujr",
|
||||
InterIndic_Gurmukhi => "InterIndic_Guru",
|
||||
InterIndic_Kannada => "InterIndic_Knda",
|
||||
InterIndic_Latin => "InterIndic_Latn",
|
||||
InterIndic_Malayalam => "InterIndic_Mlym",
|
||||
InterIndic_Oriya => "InterIndic_Orya",
|
||||
InterIndic_Tamil => "InterIndic_Taml",
|
||||
InterIndic_Telugu => "InterIndic_Telu",
|
||||
|
||||
Bengali_InterIndic => "Beng_InterIndic",
|
||||
Devanagari_InterIndic => "Deva_InterIndic",
|
||||
Gujarati_InterIndic => "Gujr_InterIndic",
|
||||
Gurmukhi_InterIndic => "Guru_InterIndic",
|
||||
Kannada_InterIndic => "Knda_InterIndic",
|
||||
Latin_InterIndic => "Latn_InterIndic",
|
||||
Malayalam_InterIndic => "Mlym_InterIndic",
|
||||
Oriya_InterIndic => "Orya_InterIndic",
|
||||
Tamil_InterIndic => "Taml_InterIndic",
|
||||
Telugu_InterIndic => "Telu_InterIndic",
|
||||
|
||||
Han_Pinyin => $JAVA_ONLY,
|
||||
Kanji_English => $JAVA_ONLY,
|
||||
Kanji_OnRomaji => $JAVA_ONLY,
|
||||
);
|
||||
|
||||
my ($x,$x,$x,$x,$x,$THIS_YEAR) = localtime();
|
||||
$THIS_YEAR += 1900;
|
||||
|
||||
# Header blocks of text written at start of ICU output files
|
||||
$HEADER1 = <<END;
|
||||
my $HEADER1 = <<END;
|
||||
//--------------------------------------------------------------------
|
||||
// Copyright (c) 1999-2004, International Business Machines
|
||||
// Copyright (c) 1999-$THIS_YEAR, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
END
|
||||
$HEADER2 = <<END;
|
||||
my $HEADER2 = <<END;
|
||||
//--------------------------------------------------------------------
|
||||
END
|
||||
|
||||
$TOOL = $0;
|
||||
my $TOOL = $0;
|
||||
|
||||
# Convert the index first; this tells us which rule files are in use.
|
||||
convertIndex();
|
||||
|
||||
# print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";
|
||||
|
||||
# Iterate over all Java RBT rule files
|
||||
foreach (<$DIR/Transliterator_*.txt>) {
|
||||
next if (/~$/);
|
||||
next if (/_index\.txt$/);
|
||||
next if ($ID && !/$ID/);
|
||||
my ($out, $id) = convertFileName($_);
|
||||
if ($out) {
|
||||
if ($out eq $JAVA_ONLY) {
|
||||
print STDERR "$id: Java only\n";
|
||||
next;
|
||||
next if (/~$/); # Ignore emacs backups
|
||||
next if (/_index\.txt$/); # The index file was processed above
|
||||
# Select either the command-line arg, if there was one, or
|
||||
# any files mentioned in the index.
|
||||
my $leaf = $_;
|
||||
$leaf =~ s|.+[/\\]||;
|
||||
if (($ID && $leaf =~ /$ID/) || exists $USED_FILES{$leaf}) {
|
||||
my ($out, $id) = convertFileName($_);
|
||||
if ($out) {
|
||||
if ($out eq $JAVA_ONLY) {
|
||||
print STDERR "*** $id skipped: Java only ***\n";
|
||||
next;
|
||||
}
|
||||
file($id, $_, $out);
|
||||
}
|
||||
file($id, $_, $out);
|
||||
} elsif (!$ID) {
|
||||
print "*** $leaf skipped: not in use ***\n";
|
||||
}
|
||||
}
|
||||
|
||||
convertIndex();
|
||||
|
||||
######################################################################
|
||||
# Convert a Java file name to C
|
||||
# Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$|
|
||||
|
@ -155,14 +221,18 @@ sub convertFileName {
|
|||
if ($out ne $JAVA_ONLY) {
|
||||
$out = 't_' . $out;
|
||||
}
|
||||
if (length($out) > $MAX_ICU4C_FILENAME_LEN) {
|
||||
print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n";
|
||||
return '';
|
||||
}
|
||||
return ($out, $id);
|
||||
}
|
||||
|
||||
######################################################################
|
||||
# Convert the index file from Java to C format
|
||||
sub convertIndex {
|
||||
$JAVA_INDEX = "Transliterator_index.txt";
|
||||
$C_INDEX = "translit_index.txt";
|
||||
my $JAVA_INDEX = "Transliterator_index.txt";
|
||||
my $C_INDEX = "translit_index.txt";
|
||||
open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
|
||||
open(C_INDEX, ">$OUTDIR/$C_INDEX") or die;
|
||||
|
||||
|
@ -182,7 +252,7 @@ sub convertIndex {
|
|||
translit_index {
|
||||
RuleBasedTransliteratorIDs {
|
||||
END
|
||||
|
||||
|
||||
while (<JAVA_INDEX>) {
|
||||
# ignore $Source $Revision $Date CVS keyword substitutions
|
||||
next if /\$Source/ ;
|
||||
|
@ -195,13 +265,13 @@ END
|
|||
|
||||
# Comments; change # to //
|
||||
if (s|^(\s*)\#|$1//|) {
|
||||
print C_INDEX;
|
||||
next;
|
||||
print C_INDEX;
|
||||
next;
|
||||
}
|
||||
# Blank lines
|
||||
if (!/\S/) {
|
||||
print C_INDEX;
|
||||
next;
|
||||
print C_INDEX;
|
||||
next;
|
||||
}
|
||||
# Content lines
|
||||
chomp;
|
||||
|
@ -210,26 +280,28 @@ END
|
|||
$_=~ s/\\p/\\\\p/g;
|
||||
my @a = split(':', $_);
|
||||
if ($a[1] eq 'file' || $a[1] eq 'internal') {
|
||||
# Convert the file name
|
||||
my $id;
|
||||
($a[2], $id) = convertFileName($a[2]);
|
||||
if ($a[2] eq $JAVA_ONLY) {
|
||||
$prefix = '// Java only: ';
|
||||
}
|
||||
# Convert the file name
|
||||
my $id;
|
||||
# Record file names in use
|
||||
$USED_FILES{$a[2]} = 1;
|
||||
($a[2], $id) = convertFileName($a[2]);
|
||||
if ($a[2] eq $JAVA_ONLY) {
|
||||
$prefix = '// Java only: ';
|
||||
}
|
||||
|
||||
# Delete the encoding field
|
||||
splice(@a, 3, 1);
|
||||
# Delete the encoding field
|
||||
splice(@a, 3, 1);
|
||||
} elsif ($a[1] eq 'alias') {
|
||||
# Pad out with extra blank fields to make the
|
||||
# 2-d array square
|
||||
push @a, "";
|
||||
# Pad out with extra blank fields to make the
|
||||
# 2-d array square
|
||||
push @a, "";
|
||||
} else {
|
||||
die "Can't parse $_";
|
||||
die "Can't parse $_";
|
||||
}
|
||||
print C_INDEX
|
||||
$prefix, "{ ",
|
||||
join(", ", map("\"$_\"", @a)),
|
||||
" },\n";
|
||||
$prefix, "{ ",
|
||||
join(", ", map("\"$_\"", @a)),
|
||||
" },\n";
|
||||
}
|
||||
|
||||
print C_INDEX <<END;
|
||||
|
@ -273,7 +345,7 @@ sub file {
|
|||
|
||||
# Open file, write UTF8 marker, close it, and reopen in text mode
|
||||
open(OUT, ">$OUTDIR/$OUT") or die;
|
||||
binmode OUT; # Must do this so we can write our UTF8 marker
|
||||
binmode OUT; # Must do this so we can write our UTF8 marker
|
||||
print OUT pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
|
||||
close(OUT);
|
||||
|
||||
|
@ -287,7 +359,7 @@ sub file {
|
|||
print OUT " Rule {\n";
|
||||
|
||||
open(IN, $IN) or die;
|
||||
binmode IN; # IN is a UTF8 file
|
||||
binmode IN; # IN is a UTF8 file
|
||||
|
||||
my $first = 1;
|
||||
my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark
|
||||
|
@ -296,21 +368,21 @@ sub file {
|
|||
# and taking other text and enclosing it in double quotes
|
||||
while (<IN>) {
|
||||
my $raw = $_;
|
||||
# ignore $Source $Revision $Date CVS keyword substitutions
|
||||
next if /\$Source/ ;
|
||||
next if /\$Revision/ ;
|
||||
next if /\$Date/ ;
|
||||
|
||||
# we have printed out the copyright info ... ignore one in Java version
|
||||
next if /Copyright/ ;
|
||||
next if /Corporation/;
|
||||
|
||||
# ignore $Source $Revision $Date CVS keyword substitutions
|
||||
next if /\$Source/ ;
|
||||
next if /\$Revision/ ;
|
||||
next if /\$Date/ ;
|
||||
|
||||
# we have printed out the copyright info ... ignore one in Java version
|
||||
next if /Copyright/ ;
|
||||
next if /Corporation/;
|
||||
|
||||
# Look for and delete BOM
|
||||
if ($first) {
|
||||
s/^$BOM//;
|
||||
$first = 0;
|
||||
s/^$BOM//;
|
||||
$first = 0;
|
||||
}
|
||||
|
||||
|
||||
# Clean the eol junk up
|
||||
s/[\x0D\x0A]+$//;
|
||||
|
||||
|
@ -319,51 +391,51 @@ sub file {
|
|||
# concatenated. Count trailing backslashes; if they are odd,
|
||||
# one is trailing.
|
||||
if (m|(\\+)$|) {
|
||||
if ((length($1) % 2) == 1) {
|
||||
s|\\$||;
|
||||
}
|
||||
if ((length($1) % 2) == 1) {
|
||||
s|\\$||;
|
||||
}
|
||||
}
|
||||
|
||||
# Transform escaped characters
|
||||
hideEscapes();
|
||||
|
||||
if (/^(\s*)(\#.*)$/) {
|
||||
# Comment-only line
|
||||
my ($white, $cmt) = ($1, $2);
|
||||
$cmt =~ s|\#|//|;
|
||||
$_ = $white . $cmt;
|
||||
# Comment-only line
|
||||
my ($white, $cmt) = ($1, $2);
|
||||
$cmt =~ s|\#|//|;
|
||||
$_ = $white . $cmt;
|
||||
|
||||
} elsif (!/\S/) {
|
||||
# Blank line -- leave as-is
|
||||
# Blank line -- leave as-is
|
||||
|
||||
} else {
|
||||
# Remove single-quoted matter
|
||||
my @quotes;
|
||||
my $nquotes = 0;
|
||||
my $x = $_;
|
||||
while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
|
||||
push @quotes, $2;
|
||||
++$nquotes;
|
||||
}
|
||||
# Remove single-quoted matter
|
||||
my @quotes;
|
||||
my $nquotes = 0;
|
||||
my $x = $_;
|
||||
while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
|
||||
push @quotes, $2;
|
||||
++$nquotes;
|
||||
}
|
||||
|
||||
# Extract comment
|
||||
my $cmt = '';
|
||||
if (s|\#(.*)||) {
|
||||
$cmt = '//' . $1;
|
||||
}
|
||||
|
||||
# Add quotes
|
||||
s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
|
||||
# Extract comment
|
||||
my $cmt = '';
|
||||
if (s|\#(.*)||) {
|
||||
$cmt = '//' . $1;
|
||||
}
|
||||
|
||||
# Restore single-quoted matter
|
||||
for (my $i=0; $i<$nquotes; ++$i) {
|
||||
s|<<x$i>>|$quotes[$i]|;
|
||||
}
|
||||
# Add quotes
|
||||
s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
|
||||
|
||||
# Restore comment
|
||||
$_ .= $cmt;
|
||||
# Restore single-quoted matter
|
||||
for (my $i=0; $i<$nquotes; ++$i) {
|
||||
s|<<x$i>>|$quotes[$i]|;
|
||||
}
|
||||
|
||||
# Restore comment
|
||||
$_ .= $cmt;
|
||||
}
|
||||
|
||||
|
||||
# Restore escaped characters
|
||||
restoreEscapes();
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue