ICU-4301 commit the collation probe tools

X-SVN-Rev: 20601
This commit is contained in:
Andy Heninger 2006-10-27 00:03:21 +00:00
parent 615bf48660
commit 8fbddcf5c7
27 changed files with 7832 additions and 0 deletions

81
tools/colprobe/Makefile.in Executable file
View file

@ -0,0 +1,81 @@
## Makefile.in for ICU - extra/colprobe
## Copyright (c) 2001, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = extra/colprobe
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS)
## Target information
TARGET = colprobe
LONGNAME = longname
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/io
LIBS = $(LIBICUI18N) $(LIBICUUC) $(LIBUSTDIO) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = colprobeNew.o line.o sortedlines.o strengthprobe.o uprinter.o
LONGNAME_OBJ = longname.o
DEPS = $(OBJECTS:.o=.d)
LONGNAME_DEPS = $(LONGNAME_OBJ:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
all-local: $(TARGET)
install-local:
dist-local:
## Remove everything this Makefile can build: the files listed in CLEANFILES,
## the colprobe objects and binary, and the optional longname helper together
## with its object and dependency files (built by the $(LONGNAME) rule).
clean-local:
	test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
	$(RMV) $(OBJECTS) $(TARGET)
	$(RMV) $(LONGNAME_OBJ) $(LONGNAME_DEPS) $(LONGNAME)
distclean-local: clean-local
$(RMV) Makefile
check-local: all-local
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
$(TARGET) : $(OBJECTS)
$(LINK.cc) -o $@ $^ $(LIBS)
$(LONGNAME) : $(LONGNAME_OBJ)
$(LINK.cc) -o $@ $^ $(LIBS)
invoke:
ICU_DATA=$${ICU_DATA:-$(top_builddir)/data/} TZ=PST8PDT $(INVOKE) $(INVOCATION)
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

1730
tools/colprobe/colprobe.cpp Executable file

File diff suppressed because it is too large Load diff

148
tools/colprobe/colprobe.dsp Executable file
View file

@ -0,0 +1,148 @@
# Microsoft Developer Studio Project File - Name="colprobe" - Package Owner=<4>
# Microsoft Developer Studio Generated Build File, Format Version 6.00
# ** DO NOT EDIT **
# TARGTYPE "Win32 (x86) Console Application" 0x0103
CFG=colprobe - Win32 Debug
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
!MESSAGE use the Export Makefile command and run
!MESSAGE
!MESSAGE NMAKE /f "colprobe.mak".
!MESSAGE
!MESSAGE You can specify a configuration when running NMAKE
!MESSAGE by defining the macro CFG on the command line. For example:
!MESSAGE
!MESSAGE NMAKE /f "colprobe.mak" CFG="colprobe - Win32 Debug"
!MESSAGE
!MESSAGE Possible choices for configuration are:
!MESSAGE
!MESSAGE "colprobe - Win32 Release" (based on "Win32 (x86) Console Application")
!MESSAGE "colprobe - Win32 Debug" (based on "Win32 (x86) Console Application")
!MESSAGE
# Begin Project
# PROP AllowPerConfigDependencies 0
# PROP Scc_ProjName ""
# PROP Scc_LocalPath ""
CPP=cl.exe
RSC=rc.exe
!IF "$(CFG)" == "colprobe - Win32 Release"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "Release"
# PROP BASE Intermediate_Dir "Release"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "Release"
# PROP Intermediate_Dir "Release"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /GX /O2 /I "../../../include" /I "../../tools/toolutil" /I "../../common" /I "../../i18n" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# ADD LINK32 icuio.lib icuuc.lib icuin.lib icutu.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /libpath:"../../../lib"
!ELSEIF "$(CFG)" == "colprobe - Win32 Debug"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "Debug"
# PROP BASE Intermediate_Dir "Debug"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "Debug"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /I "../../../include" /I "../../tools/toolutil" /I "../../common" /I "../../i18n" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# ADD LINK32 icuiod.lib icuucd.lib icuind.lib icutud.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"../../../lib"
!ENDIF
# Begin Target
# Name "colprobe - Win32 Release"
# Name "colprobe - Win32 Debug"
# Begin Group "Source Files"
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
# Begin Source File
SOURCE=.\colprobeNew.cpp
# End Source File
# Begin Source File
SOURCE=.\line.cpp
# End Source File
# Begin Source File
SOURCE=.\sortedlines.cpp
# End Source File
# Begin Source File
SOURCE=.\strengthprobe.cpp
# End Source File
# Begin Source File
SOURCE=.\targetsetgenerator.cpp
# End Source File
# Begin Source File
SOURCE=.\uprinter.cpp
# End Source File
# End Group
# Begin Group "Header Files"
# PROP Default_Filter "h;hpp;hxx;hm;inl"
# Begin Source File
SOURCE=.\colprobe.h
# End Source File
# Begin Source File
SOURCE=.\line.h
# End Source File
# Begin Source File
SOURCE=.\sortedlines.h
# End Source File
# Begin Source File
SOURCE=.\strengthprobe.h
# End Source File
# Begin Source File
SOURCE=.\targetsetgenerator.h
# End Source File
# Begin Source File
SOURCE=.\uprinter.h
# End Source File
# End Group
# Begin Group "Resource Files"
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
# End Group
# End Target
# End Project

15
tools/colprobe/colprobe.h Executable file
View file

@ -0,0 +1,15 @@
// Shared declarations for the collation probe tools.
#ifndef COLPROBE_H
#define COLPROBE_H
#include "unicode/uniset.h"
#include "unicode/normlzr.h"
// Callback that compares two elements passed as opaque pointers and returns an int.
// NOTE(review): the signature matches what qsort()/bsearch() accept - confirm how callers use it.
typedef int (*CompareFn) (const void *elem1, const void *elem2);
// Callback that receives a UTF-16 string with its length plus an output byte
// buffer with its capacity.
// NOTE(review): presumably returns the number of sort key bytes - confirm against the implementations.
typedef int (*GetSortKeyFn) (const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity);
//typedef int (__cdecl *CompareFn)(const void *elem1, const void *elem2);
// Declared here, defined elsewhere: fills `rep` for `locale`, and reports through
// `hanAppears` and `status` (all three are output references).
void generateRepertoire(const char *locale, UnicodeSet &rep, UBool &hanAppears, UErrorCode &status);
// Declared here, defined elsewhere: returns a UnicodeSet derived from `source`.
UnicodeSet flatten(const UnicodeSet &source, UErrorCode &status);
//UnicodeSet generateRepertoire(const char *locale);
#endif

1078
tools/colprobe/colprobeNew.cpp Executable file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,164 @@
#! /usr/bin/perl -w
# Writes "<locale>_collation.html": a one-row table that puts the collation
# rules recorded for a locale under common/, linux/ and winxp/ side by side.
# Usage: <this script> <locale>
use strict;
my $locale = $ARGV[0];
defined $locale or die "Usage: $0 <locale>\n";
# The helper's output is embedded verbatim in the page heading.
my $long_name = `/home/weiv/src/icu/source/extra/colprobe/longname $locale`;
my $pageTitle = $locale."_collation";
my $filename = $pageTitle.".html";
# Three-argument open: the name comes from the command line and must never be
# read as an open mode; a failure is reported instead of silently ignored.
open(TABLE, '>', $filename) or die "cannot open $filename for writing: $!\n";
print TABLE <<"EndOfTemplate";
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>$pageTitle</title>
<style>
<!--
table { border-spacing: 0; border-collapse: collapse; width: 100%;
border: 1px solid black }
td, th { width: 10%; border-spacing: 0; border-collapse: collapse; color: black;
vertical-align: top; border: 1px solid black }
-->
</style>
</head>
<body bgcolor="#FFFFFF">
<p><b><font color="#FF0000">Collation:</font> $locale ($long_name) <a href="http://oss.software.ibm.com/cgi-bin/icu/lx/en/?_=$locale">Demo</a>,
<a href="../all_diff_xml/comparison_charts.html">Cover
Page</a>, <a href="../all_diff_xml/index.html">Index</a></b></p>
<table>
<tr>
EndOfTemplate

# --- COMMON column header ---------------------------------------------------
my $dirCommon = "common";
my $refCommon = $dirCommon."/UCARules.txt";
my $nameCommon = $dirCommon."/".$locale."_collation.html";
my $colorCommon = "#AD989D";
my $loc;
if(!(-e $nameCommon)) {
    # Fall back to the part of the locale before the first underscore.
    # split is used instead of $` because $` keeps a stale (or undefined)
    # value when the locale contains no underscore at all.
    ($loc) = split(/_/, $locale, 2);
    $loc = $locale unless defined $loc;
    $nameCommon = "$dirCommon/$loc"."_collation.html";
}
print TABLE " <th bgcolor=\"$colorCommon\">COMMON (<a href=\"$refCommon\">UCA</a> <a href=\"../$dirCommon/xml/$locale.xml\">xml</a>)</th>\n";

# --- LINUX column header ----------------------------------------------------
my $dirLinux = "linux";
my $refLinux = $dirLinux."/".$locale.".utf8_default_raw.html";
my $rawLinux = $dirLinux."/".$locale.".utf8_raw.html";
my $defLinux = $dirLinux."/".$locale;
my $nameLinux = "$dirLinux/$locale".".utf8_collation.html";
my $colorLinux = "#1191F1";
print TABLE " <th bgcolor=\"$colorLinux\">LINUX (";
if (!(-e $nameLinux)) {
    #try the variant that has @euro stuck in
    $nameLinux = "$dirLinux/$locale".'.utf8@euro_collation.html';
    if(-e $nameLinux) {
        $refLinux = $dirLinux."/".$locale.'.utf8@euro_default_raw.html';
        $rawLinux = $dirLinux."/".$locale.'.utf8@euro_raw.html';
    }
}
if (-e $nameLinux) {
    print TABLE "<a href=\"$rawLinux\">Ordering</a> <a href=\"$defLinux\">Definition</a> <a href=\"$refLinux\">base</a>";
}
print TABLE " <a href=\"../$dirLinux/xml/$locale.xml\">xml</a>)</th>\n";

# --- WINDOWS column header --------------------------------------------------
my $dirWin = "winxp";
my $refWin = $dirWin."/".$locale."_default_raw.html";
my $rawWin = $dirWin."/".$locale."_raw.html";
my $nameWin = "$dirWin/$locale"."_collation.html";
my $colorWin = "#98FB98";
print TABLE " <th bgcolor=\"$colorWin\">WINDOWS (";
if (-e $nameWin) {
    print TABLE "<a href=\"$rawWin\">Ordering</a> <a href=\"$refWin\">base</a> ";
}
print TABLE "<a href=\"../windows/xml/$locale.xml\">xml</a>)</th>\n";

# --- one cell of rules per platform -----------------------------------------
print TABLE " </tr>\n <tr>";
readRules($nameCommon, "#AD989D", "Same as the UCA.");
readRules($nameLinux, "#1191F1", "No data available.");
readRules($nameWin, "#98FB98", "No data available.");
print TABLE <<"EndOfFooter";
</tr>
</table>
</body>
</html>
EndOfFooter
# Buffered write errors only surface at close, so check it.
close(TABLE) or die "cannot finish writing $filename: $!\n";
# readRules($file, $color, $comment)
# Writes one <td> cell, shaded with $color, to the TABLE handle.
#   * $file exists: the cell holds every line that follows a line containing
#     "Sequence", up to (not including) the next line ending in "}<br>".  If no
#     line was copied the cell says "Same ordering as base".
#   * $file does not exist: the cell holds $comment.
# Also prints "found sequence" to STDOUT for each "Sequence" line seen.
# Dies when an existing $file cannot be opened.
sub readRules {
    my ($filename, $color, $comment) = @_;

    if (!-e $filename) {
        print TABLE "<td bgcolor=\"$color\">\n$comment</td>\n";
        return;
    }

    # Three-argument open with a lexical handle: the name is derived from a
    # command-line argument and must not be interpreted as a mode or a pipe.
    open(my $file, '<', $filename)
        or die "something very strange happened: cannot read $filename: $!\n";
    print TABLE "<td bgcolor=\"$color\">\n";

    my $noLines = 0;     # lines copied since the last "Sequence" marker
    my $printOut = 0;    # true while inside a Sequence { ... } section
    # A lexical loop variable keeps the caller's $_ intact.
    while (my $row = <$file>) {
        if ($row =~ /\}\<br\>$/) {
            $printOut = 0;
        }
        if ($printOut) {
            print TABLE $row;
            $noLines++;
        }
        if ($row =~ /Sequence/) {
            $printOut = 1;
            print "found sequence\n";
            $noLines = 0;
        }
    }
    close($file);

    if (!$noLines) {
        print TABLE "Same ordering as base\n";
    }
    print TABLE "</td>\n";
    return;
}
# Tasting of food product
# 650-574-4551 $50 1 hour
# <td bgcolor="#AD989D">1.0-alpha</td>
# <td bgcolor="#FF6633">1.0</td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FF6633"><span title="006E {LATIN SMALL LETTER N}">&amp;n</span><br>
# <span title="006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&lt;&nbsp;ny</span><br>
# <span title="006E 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y} / 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;=&nbsp;nny&nbsp;/&nbsp;ny</span><br>
# <span title="006E 0059 {LATIN SMALL LETTER N} {LATIN CAPITAL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&lt;&nbsp;nY</span><br>
# </td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FFFF33">1.2</td>
# <td bgcolor="#98FB98">Windows XP</td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FF6633">=</td>

View file

@ -0,0 +1,209 @@
#! /usr/bin/perl -w
# Writes "<locale>.html": a one-row table that puts the collation rules found
# under icucollations/, linuxcollations/ and w2kcollations/ side by side.
# Usage: <this script> <locale>
use strict;
use IO::File;
my $locale = $ARGV[0];
defined $locale or die "Usage: $0 <locale>\n";
# The helper's output is embedded verbatim in the page heading.
my $long_name = `/home/weiv/src/icu/source/extra/colprobe/longname $locale`;
print "Long name is $long_name\n";
my $pageTitle = $locale." collation";
my $filename = $locale.".html";
# Three-argument open: the name comes from the command line and must never be
# read as an open mode; a failure is reported instead of silently ignored.
open(TABLE, '>', $filename) or die "cannot open $filename for writing: $!\n";
print TABLE <<"EndOfTemplate";
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>$pageTitle</title>
<style>
<!--
table { border-spacing: 0; border-collapse: collapse; width: 100%;
border: 1px solid black }
td, th { width: 10%; border-spacing: 0; border-collapse: collapse; color: black;
vertical-align: top; border: 1px solid black }
-->
</style>
</head>
<body bgcolor="#FFFFFF">
<p><b><font color="#FF0000">Collation:</font> $locale ($long_name) <a href="http://oss.software.ibm.com/cgi-bin/icu/lx/en/?_=$locale">Demo</a>,
<a href="../../comparison_charts.html">Cover
Page</a>, <a href="../main/index.html">Locale Diffs Index</a>, <a href="index.html">Collation Diffs Index</a></b></p>
<table>
<tr>
EndOfTemplate

# --- COMMON column header ---------------------------------------------------
my $dirCommon = "icucollations";
my $refCommon = $dirCommon."/UCARules.txt";
my $nameCommon = $dirCommon."/".$locale."_collation.html";
my $colorCommon = "#AD989D";
my $loc = $locale;
if(!(-e $nameCommon)) {
    # Fall back to the part of the locale before the first underscore.
    # split is used instead of $` because $` keeps a stale (or undefined)
    # value when the locale contains no underscore at all.
    ($loc) = split(/_/, $locale, 2);
    $loc = $locale unless defined $loc;
    $nameCommon = "$dirCommon/$loc"."_collation.html";
}
print "Common is $nameCommon\n";
print TABLE " <th bgcolor=\"$colorCommon\">COMMON (";
if(-e $nameCommon) {
    print TABLE "<a href=\"../../common/collation/$loc.xml\">xml</a> ";
}
print TABLE "<a href=\"../../common/collation/root.xml\">UCA</a>)</th>\n";

# --- LINUX column header ----------------------------------------------------
my $dirLinux = "linuxcollations";
my $refLinux = $dirLinux."/".$locale.".utf8_default_raw.html";
my $rawLinux = $dirLinux."/".$locale.".utf8_raw.html";
my $defLinux = $dirLinux."/".$locale;
my $nameLinux = "$dirLinux/$locale"."_collation.html";
my $colorLinux = "#1191F1";
print TABLE " <th bgcolor=\"$colorLinux\">LINUX";
if (!(-e $nameLinux)) {
    #try the variant that has @euro stuck in
    $nameLinux = "$dirLinux/$locale".'.utf8@euro_collation.html';
    if(-e $nameLinux) {
        $refLinux = $dirLinux."/".$locale.'.utf8@euro_default_raw.html';
        $rawLinux = $dirLinux."/".$locale.'.utf8@euro_raw.html';
    }
}
if (-e $nameLinux) {
    print TABLE " (<a href=\"../../linux/collation/$locale.xml\">xml</a>";
    my $linuxBase = &getBaseLocale("$dirLinux/base", $locale);
    # The lookup may come back undefined when the locale is not listed.
    if(defined $linuxBase && $linuxBase ne "") {
        print TABLE " <a href=\"../../linux/collation/$linuxBase.xml\">Base ($linuxBase)</a>";
    }
    print TABLE ")";
}
print TABLE "</th>\n";

# --- WINDOWS column header --------------------------------------------------
my $dirWin = "w2kcollations";
my $refWin = $dirWin."/".$locale."_default_raw.html";
my $rawWin = $dirWin."/".$locale."_raw.html";
my $nameWin = "$dirWin/$locale"."_collation.html";
my $colorWin = "#98FB98";
$loc = $locale;
#try fallback for windows
print TABLE " <th bgcolor=\"$colorWin\">WINDOWS";
if(!(-e $nameWin)) {
    ($loc) = split(/_/, $locale, 2);
    $loc = $locale unless defined $loc;
    $nameWin = "$dirWin/$loc"."_collation.html";
}
print "Windows loc is $loc\n";
if (-e $nameWin) {
    print TABLE " (<a href=\"../../windows/collation/$loc.xml\">xml</a>";
    my $winBase = &getBaseLocale("$dirWin/base", $locale);
    if(defined $winBase && $winBase ne "") {
        # Leading space keeps the two links apart, as in the LINUX header.
        print TABLE " <a href=\"../../windows/collation/$winBase.xml\">base ($winBase)</a>";
    }
    print TABLE ")";
}
print TABLE "</th>\n";

# --- one cell of rules per platform -----------------------------------------
print TABLE " </tr>\n <tr>";
readRules($nameCommon, "#AD989D", "Same as the UCA.");
readRules($nameLinux, "#1191F1", "No data available.");
readRules($nameWin, "#98FB98", "No data available.");
print TABLE <<"EndOfFooter";
</tr>
</table>
</body>
</html>
EndOfFooter
# Buffered write errors only surface at close, so check it.
close(TABLE) or die "cannot finish writing $filename: $!\n";
# readRules($file, $color, $comment)
# Writes one <td> cell, shaded with $color, to the TABLE handle.
#   * $file exists: the cell holds the lines that follow a line containing
#     "Sequence", up to (not including) the next line ending in "}<br>".
#     Empty lines and lines ending in "&nbsp;<br>" are left out.  If nothing
#     was copied the cell says "Same ordering as base".
#   * $file does not exist: the cell holds $comment.
# Also prints "found sequence" to STDOUT for each "Sequence" line seen.
# Dies when an existing $file cannot be opened.
sub readRules {
    my ($filename, $color, $comment) = @_;

    if (!-e $filename) {
        print TABLE "<td bgcolor=\"$color\">\n$comment</td>\n";
        return;
    }

    # Three-argument open with a lexical handle: the name is derived from a
    # command-line argument and must not be interpreted as a mode or a pipe.
    open(my $file, '<', $filename)
        or die "something very strange happened: cannot read $filename: $!\n";
    print TABLE "<td bgcolor=\"$color\">\n";

    my $noLines = 0;     # lines copied since the last "Sequence" marker
    my $printOut = 0;    # true while inside a Sequence { ... } section
    # A lexical loop variable keeps the caller's $_ intact.
    while (my $row = <$file>) {
        if ($row =~ /\}\<br\>$/) {
            $printOut = 0;
        }
        if ($printOut) {
            if ($row !~ /^$/ && $row !~ /&nbsp;<br>$/) {
                print TABLE $row;
                $noLines++;
            }
        }
        if ($row =~ /Sequence/) {
            $printOut = 1;
            print "found sequence\n";
            $noLines = 0;
        }
    }
    close($file);

    if (!$noLines) {
        print TABLE "Same ordering as base\n";
    }
    print TABLE "</td>\n";
    return;
}
# getBaseLocale($basefile, $locale)
# Scans $basefile for the first line containing "<$locale>" and returns the
# text of the second ">"-separated field with its leading "<" (and any
# whitespace before it) removed, e.g. "en" for a line "<en_US> <en>".
# Returns "" when the locale is not listed.
# Dies when $basefile cannot be opened.
#
# The former empty "()" prototype was dropped: the sub takes two arguments,
# and the existing "&getBaseLocale(...)" calls never consulted the prototype.
sub getBaseLocale {
    my ($basefile, $locale) = @_;
    my $baseFH = IO::File->new($basefile,"r")
        or die "could not open the file $basefile for reading: $! \n";
    my $base = "";
    while (defined(my $line = <$baseFH>)) {
        # \Q..\E: characters such as "." or "@" in a locale name must match
        # literally, not as regex operators.
        next unless $line =~ /<\Q$locale\E>/;
        my (undef, $bse) = split(/>/, $line);
        $bse = "" unless defined $bse;
        $bse =~ s/^\s*<//;
        $base = $bse;
        last;
    }
    $baseFH->close;
    # Always return a defined string so callers can compare with eq/ne.
    return $base;
}
# Tasting of food product
# 650-574-4551 $50 1 hour
# <td bgcolor="#AD989D">1.0-alpha</td>
# <td bgcolor="#FF6633">1.0</td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FF6633"><span title="006E {LATIN SMALL LETTER N}">&amp;n</span><br>
# <span title="006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&lt;&nbsp;ny</span><br>
# <span title="006E 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y} / 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;=&nbsp;nny&nbsp;/&nbsp;ny</span><br>
# <span title="006E 0059 {LATIN SMALL LETTER N} {LATIN CAPITAL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&lt;&nbsp;nY</span><br>
# </td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FFFF33">1.2</td>
# <td bgcolor="#98FB98">Windows XP</td>
# <td bgcolor="#FF6633">=</td>
# <td bgcolor="#FF6633">=</td>

View file

@ -0,0 +1,246 @@
#!/usr/bin/perl
# For every file named on the command line: pipes it through uconv (hex-any/Java
# transliteration, UTF-8 in and out), looks for a "collations" section and writes
# each "Sequence" block it finds to "<locale>[_<suffix>]_collation.html" via the
# OUT handle, one processLine() result per input line.
use strict;
use Unicode::UCD 'charinfo';
use Unicode::Normalize;
use utf8;
use open ':utf8';
# Parser state shared across all lines and files.
my $printout = 0;   # true while inside a Sequence block that is being written
my $braces = 0;     # brace depth (also reused as a flag while reading an alias)
my $colls = 0;      # set once a line containing "collations" has been seen
my $aliased = 0;    # true while reading an aliased collations entry
my $newName = "";   # alias target; non-empty means the current pass is a redo
my $filename;
my $suffix;
my $locale;
NEW_FILE:
foreach my $arg (@ARGV) {
# On a fresh argument derive the locale from the file name; on a redo after an
# alias keep the original locale and just clear the alias marker.
if($newName =~ /^$/) {
$locale = $arg;
$locale =~ s#^.*/##g;
$locale =~ s/\.txt//;
} else {
$newName = "";
}
my $command = "/home/weiv/build/current/bin/uconv -x hex-any/Java -f utf8 -t utf8 $arg";
print $command."\n";
my @bundle = `$command`;
foreach $_ (@bundle) {
#while(<>) {
#print $ARGV if eof;
# Skip lines that start with a // comment.
if(/^\/\//) {
next;
}
if(/collations/) {
print "found Collations\n";
$colls = 1;
if(/alias/) {
print "collations are aliased\n";
$aliased = 1;
}
}
if($aliased) {
print "processing aliased data: $_\n";
if(/\{/) {
print "Braces opened\n";
$braces = 1;
}
if($braces && /\"(.*)\"/) {
$newName = $1;
print "Aliasing to $newName\n";
}
if($braces && /\}/) {
$braces = 0;
print "Braces closed\n";
$aliased = 0;
# NOTE(review): $filename may still be undefined here if no Sequence was seen yet.
print "Switching from $filename to $newName\n";
# Point $arg at the alias target and rerun the outer loop body for it.
$arg =~ s/$locale\.txt$/$newName\.txt/;
print "$arg\n";
redo NEW_FILE;
}
}
if(/standard|phonebook|traditional|pinyin|stroke|direct/ && $colls) {
print "found $& collation\n";
$suffix = "_".uc($&);
if(/standard/) {
$suffix = "";
}
}
if(/Sequence/ && $colls) {
#binmode ARGV, ":utf8";
$printout = 1;
#$filename = $ARGV;
$filename = $locale;
# NOTE(review): $suffix already starts with "_", so this yields a double
# underscore (e.g. "de__PHONEBOOK"); $suffix is also never reset between files.
if($suffix) {
$filename .= "_".$suffix;
}
$filename .= "_collation.html";
print "filename is $filename\n";
#open(OUT, ">:utf8", "$filename");
# NOTE(review): the result of this open is not checked.
open(OUT, ">$filename");
printHeading($arg);
#next;
}
# Work on a copy: processLine() assigns to $_.
my $line = $_;
if($line =~ /\{/ && $printout) {
$braces++;
}
if($printout) {
print OUT processLine($line);
print OUT "\n";
}
# The Sequence block ends when its braces balance again.
if( $line =~ /\}/ && $printout) {
$braces--;
if($braces == 0) {
$printout = 0;
printFooting();
close(OUT);
}
}
}
}
# processLine($line)
# Turns one line of collation rule text into HTML and returns it.
# The line is stripped of // comments, the word "Sequence", one leading and
# one trailing brace and quote, then broken into one piece per option
# ("[...]"), reset ("&"), strength ("<" .. "<<<<") or equality ("=").
# Each non-empty piece becomes one "...<br>\n" entry; pieces that contain a
# reset, strength or equality operator are wrapped in a <span> whose title
# lists the code points and character names that follow the operator.
# Returns "" when the line contains no pieces.
sub processLine {
    # Work on a private copy; the caller's $_ (aliased to its loop element)
    # is no longer overwritten.
    my $text = shift;

    # remove comments
    $text =~ s#//.*$##g;
    # remove "Sequence" if present
    $text =~ s/Sequence\s*//;
    # remove leading brace if present (braces escaped: newer perls reject or
    # warn about an unescaped literal "{" in a pattern)
    $text =~ s/^\s*\{//;
    # remove trailing brace if present
    $text =~ s/\}\s*$//;
    # remove trailing quote
    $text =~ s/"\s*$//;
    # remove lead quote
    $text =~ s/^\s*"//;
    # separate options
    $text =~ s/(\[.*\])/\n$1/g;
    # separate resets
    $text =~ s/\s*\&\s*/\n& /g;
    # separate strengths and insert spaces
    $text =~ s/\s*(<{1,4})\s*/\n$1 /g;
    # separate equals and insert spaces
    $text =~ s/\s*=\s*/\n= /g;

    my $result = "";
    # break into individual reset/strength/setting lines
    foreach my $piece (split(/\n/, $text)) {
        # skip empty lines
        next if $piece =~ /^$/;

        my $name = "";
        my $spanEnd = "";
        my $lrm = "";
        $piece = NFC($piece);

        # for resets and strengths we will get name for elements
        if ($piece =~ /<{1,4} |= |& \[.*\]|& /) {
            my $rest = substr($piece, $+[0]);   # text after the operator
            my $codes = "";
            my $names = "";
            $name = "<span title=\"";
            foreach my $char (split(//, $rest)) {
                my $charVal = ord($char);
                if ($charVal == 0x002F || $charVal == 0x007C) {
                    # "/" and "|" are part of the rule syntax, so they are
                    # entered without translation to a name
                    $name .= $codes.$names." $char ";
                    $codes = "";
                    $names = "";
                } elsif ($charVal == 0x0027) {
                    # quote: deliberately contributes nothing to the title
                } else {
                    my $charinfo = charinfo($charVal);
                    $codes .= $charinfo->{'code'}." ";
                    $names .= "{".$charinfo->{'name'}."} ";
                    # right-to-left characters get a left-to-right mark
                    if ($charinfo->{'bidi'} eq "R" || $charinfo->{'bidi'} eq "AL") {
                        $lrm = "&lrm;";
                    }
                }
            }
            $name .= $codes.$names."\" >";
            $spanEnd = "</span>";
        }

        # indent by operator
        # NOTE(review): every branch prepends the same single space here;
        # confirm the indentation intended for each strength.
        if ($piece =~ /^<<<</) {
            $piece = " $piece";
        } elsif ($piece =~ /^<<</) {
            $piece = " $piece";
        } elsif ($piece =~ /^<</) {
            $piece = " $piece";
        } elsif ($piece =~ /^</) {
            $piece = " $piece";
        } elsif ($piece =~ /^=/) {
            $piece = " $piece";
        }

        # insert spaces around slashes (fix expansions)
        $piece =~ s#/# / #g;
        # HTML-escape; "&" must go first so the entities added below survive
        $piece =~ s/\&/&amp;/g;
        $piece =~ s/ /&nbsp;/g;
        $piece =~ s/</&lt;/g;
        $piece =~ s/>/&gt;/g;

        $result .= $name.$lrm.$piece.$spanEnd."<br>\n";
    }
    return $result;
}
# printHeading($file)
# Prints the opening part of a collation page to the OUT handle: the HTML
# head followed by the first lines of the bundle, labelled with $file minus
# its first ".txt".
sub printHeading {
    my ($bundleName) = @_;
    $bundleName =~ s/\.txt//;
    my $heading = <<"EndOfHeading";
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
</head>
# Collation data resource bundle generated for locale: $bundleName<br>
# For platform icu reference platform UCA<br><br>
$bundleName&nbsp;{<br>
&nbsp;&nbsp;CollationElements&nbsp;{<br>
&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>
EndOfHeading
    print OUT $heading;
}
# printFooting()
# Prints the closing lines of a collation page to the OUT handle: the three
# closing braces of the bundle followed by the closing </pre> and </html> tags.
sub printFooting {
    my @closing = (
        "&nbsp;&nbsp;&nbsp;&nbsp;}<br>",
        "&nbsp;&nbsp;}<br>",
        "}<br>",
        "</pre>",
        "</html>",
    );
    print OUT join("", map { "$_\n" } @closing);
}

24
tools/colprobe/gcd2.pl Executable file
View file

@ -0,0 +1,24 @@
#!/usr/bin/perl -w
# For every locale listed in locale.txt: runs colprobe in --diff mode for the
# platform named on the command line, logging to "<platform>logs2/<locale>Log.txt",
# then copies the locale definition from /usr/share/i18n/locales into "<platform>/".
# Usage: gcd2.pl <platform>
use strict;
#my $localeMinusA = `locale -a`;
my $localeMinusA = `cat ~/src/icu/source/extra/colprobe/locale.txt`;
my @locales = split(/\n/, $localeMinusA);
my $platform = $ARGV[0];
defined $platform or die "Usage: $0 <platform>\n";
# Create the output directories; an already existing directory is fine, any
# other failure would make every redirect below fail, so stop right away.
foreach my $dir ($platform."logs2", $platform) {
    -d $dir or mkdir($dir) or die "cannot create directory $dir: $!\n";
}
foreach my $entry (@locales) {
    my $command = "~/src/icu/source/extra/colprobe/colprobe --platform $platform --ref $platform --diff $entry >$platform"."logs2/$entry"."Log.txt 2>&1";
    # The definition file is named after the part before the first dot.
    # A separate variable leaves @locales and $_ untouched.
    my ($name) = split(/\./, $entry);
    $name = $entry unless defined $name;
    $command .= "; cp /usr/share/i18n/locales/$name $platform/";
    print "$command\n";
    `$command`;
    #chdir "..";
}

23
tools/colprobe/genCollData.pl Executable file
View file

@ -0,0 +1,23 @@
#!/usr/bin/perl -w
# For every locale reported by `locale -a`: runs colprobe with "--output resb
# --platform linux --ref linux", logging to "<platform>logs/<locale>Log.txt",
# then copies the locale definition from /usr/share/i18n/locales into "<platform>/".
# Usage: genCollData.pl <platform>
use strict;
my $localeMinusA = `locale -a`;
my @locales = split(/\n/, $localeMinusA);
my $platform = $ARGV[0];
defined $platform or die "Usage: $0 <platform>\n";
# Create the output directories; an already existing directory is fine, any
# other failure would make every redirect below fail, so stop right away.
foreach my $dir ($platform."logs", $platform) {
    -d $dir or mkdir($dir) or die "cannot create directory $dir: $!\n";
}
foreach my $entry (@locales) {
    my $command = "~/src/icu/source/extra/colprobe/colprobe --output resb --platform linux --ref linux $entry >$platform"."logs/$entry"."Log.txt 2>&1";
    # The definition file is named after the part before the first dot.
    # A separate variable leaves @locales and $_ untouched.
    my ($name) = split(/\./, $entry);
    $name = $entry unless defined $name;
    $command .= "; cp /usr/share/i18n/locales/$name $platform/";
    print "$command\n";
    `$command`;
    #chdir "..";
}

701
tools/colprobe/line.cpp Executable file
View file

@ -0,0 +1,701 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File line.cpp
*
* Modification History:
*
* Date Name Description
* 03/18/2003 weiv Creation.
*******************************************************************************
*/
#include "line.h"
#include <stdio.h>
UnicodeSet * Line::needsQuoting = NULL;
/*
 * Puts every scalar member, flag and link into its empty state.
 * The name and expansionString buffers are not touched here.
 */
void
Line::init()
{
    // links to neighbouring lines
    previous = NULL;
    next = NULL;
    left = NULL;
    right = NULL;

    // lengths and indices
    len = 0;
    expLen = 0;
    expIndex = 0;

    // strengths
    strength = UCOL_OFF;
    strengthFromEmpty = UCOL_OFF;
    cumulativeStrength = UCOL_OFF;
    expStrength = UCOL_OFF;

    // flags
    isContraction = FALSE;
    isExpansion = FALSE;
    isRemoved = FALSE;
    isReset = FALSE;

    // combining classes and sort key
    firstCC = 0;
    lastCC = 0;
    sortKey = NULL;
}
/* Default constructor: an empty line with both text buffers zero-filled. */
Line::Line()
{
    init();
    // init() leaves the two text buffers alone, so clear them here.
    const size_t bufferBytes = 25*sizeof(UChar);
    memset(name, 0, bufferBytes);
    memset(expansionString, 0, bufferBytes);
}
/*
 * Builds a line from the first `len` UTF-16 units of `name`.
 * Both text buffers are zero-filled first, so the copied name is always
 * NUL-terminated when it is shorter than the buffer; other members of this
 * class read name with u_strcpy/u_strcmp and depend on that terminator.
 * firstCC/lastCC are the combining classes of the first and last code point;
 * for an empty name they keep the 0 set by init().
 */
Line::Line(const UChar* name, int32_t len)
{
    init();
    memset(this->name, 0, 25*sizeof(UChar));
    memset(expansionString, 0, 25*sizeof(UChar));
    this->len = len;
    u_memcpy(this->name, name, len);
    if(len > 0) {
        // Guarded: with len == 0 the second lookup would index name[-1].
        UChar32 c;
        U16_GET(name, 0, 0, len, c);
        firstCC = u_getCombiningClass(c);
        U16_GET(name, 0, len-1, len, c);
        lastCC = u_getCombiningClass(c);
    }
}
/* Builds a one-unit line holding `name`; first and last combining class are the same. */
Line::Line(const UChar name)
{
    init();
    memset(expansionString, 0, 25*sizeof(UChar));
    this->name[0] = name;
    this->name[1] = 0;
    len = 1;
    firstCC = u_getCombiningClass(name);
    lastCC = firstCC;
}
/*
 * Builds a line whose name is a copy of `string`.
 * The text buffers are zero-filled first, as in the other constructors:
 * init() does not touch them and setTo() only writes name, so without this
 * expansionString would hold indeterminate data that operator= later copies
 * with u_strcpy.
 */
Line::Line(const UnicodeString &string)
{
    init();
    memset(name, 0, 25*sizeof(UChar));
    memset(expansionString, 0, 25*sizeof(UChar));
    setTo(string);
}
/*
 * Builds a line from a character buffer: the four link pointers start out
 * NULL and everything else is delegated to initFromString().
 * NOTE(review): initFromString() is defined outside this view - confirm that
 * it initialises the remaining members (init() is not called here).
 */
Line::Line(const char *buff, int32_t buffLen, UErrorCode &status) :
previous(NULL),
next(NULL),
left(NULL),
right(NULL)
{
initFromString(buff, buffLen, status);
}
/*
 * Copy constructor: starts with NULL links, then reuses the assignment
 * operator for the member-wise copy.
 */
Line::Line(const Line &other) :
previous(NULL),
next(NULL),
left(NULL),
right(NULL)
{
*this = other;
}
/*
 * Assignment: copies lengths, strengths, flags, combining classes, both
 * text buffers, the sortKey pointer and the left/right links from `other`.
 * previous and next are deliberately left as they are.
 * Self-assignment is a no-op; without the guard u_strcpy would be called
 * with identical source and destination buffers.
 */
Line &
Line::operator=(const Line &other) {
    if(this == &other) {
        return *this;
    }
    len = other.len;
    expLen = other.expLen;
    strength = other.strength;
    strengthFromEmpty = other.strengthFromEmpty;
    cumulativeStrength = other.cumulativeStrength;
    expStrength = other.expStrength;
    isContraction = other.isContraction;
    isExpansion = other.isExpansion;
    isRemoved = other.isRemoved;
    isReset = other.isReset;
    expIndex = other.expIndex;
    firstCC = other.firstCC;
    lastCC = other.lastCC;
    u_strcpy(name, other.name);
    u_strcpy(expansionString, other.expansionString);
    // shallow copies: the pointers are shared, not duplicated
    sortKey = other.sortKey;
    left = other.left;
    right = other.right;
    return *this;
}
/* Two lines compare equal when they are the same object or have the same length and name. */
UBool
Line::operator==(const Line &other) const {
    if(this == &other) {
        return TRUE;
    }
    const UBool sameName = (len == other.len) && (u_strcmp(name, other.name) == 0);
    return sameName ? TRUE : FALSE;
}
/*
 * Stricter comparison than operator==: besides length and name, the
 * strength, the expansion length and the expansion string must match too.
 */
UBool
Line::equals(const Line &other) const {
    if(this == &other) {
        return TRUE;
    }
    if(len != other.len || u_strcmp(name, other.name) != 0) {
        return FALSE;
    }
    if(strength != other.strength || expLen != other.expLen) {
        return FALSE;
    }
    return (u_strcmp(expansionString, other.expansionString) == 0) ? TRUE : FALSE;
}
/* Negation of operator==. */
UBool
Line::operator!=(const Line &other) const {
    const UBool same = (*this == other);
    return !same;
}
/* Destructor: the body is empty, nothing is released here. */
Line::~Line() {
}
/* Assigns src[0..size-1] to dest[0..size-1], element by element in ascending order. */
void
Line::copyArray(Line *dest, const Line *src, int32_t size) {
    int32_t index = 0;
    while(index < size) {
        dest[index] = src[index];
        ++index;
    }
}
/*
 * Replaces the name with the first `len` UTF-16 units of `name` and
 * refreshes firstCC/lastCC from its first and last code point.
 * The copy is NUL-terminated when it fits (the buffer is handled as 25
 * units elsewhere in this class), so a shorter name does not keep the tail
 * of the previous one. An empty name gets combining class 0 at both ends
 * instead of indexing name[-1].
 */
void
Line::setName(const UChar* name, int32_t len) {
    this->len = len;
    u_memcpy(this->name, name, len);
    if(len >= 0 && len < 25) {
        this->name[len] = 0;
    }
    if(len <= 0) {
        firstCC = 0;
        lastCC = 0;
        return;
    }
    UChar32 c;
    U16_GET(name, 0, 0, len, c);
    firstCC = u_getCombiningClass(c);
    U16_GET(name, 0, len-1, len, c);
    lastCC = u_getCombiningClass(c);
}
void
Line::setToConcat(const Line *first, const Line *second) {
u_strcpy(name, first->name);
u_strcat(name, second->name);
len = first->len + second->len;
firstCC = first->firstCC;
lastCC = second->lastCC;
}
/*
 * Describes the `len` UTF-16 units of `string` as text: first every code
 * point in hex (4 digits below U+10000, 6 digits otherwise, each followed by
 * a space), then every code point's extended character name as "{NAME} ".
 */
UnicodeString
Line::stringToName(UChar *string, int32_t len) {
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString result;
    char buffer[256];
    UChar32 c;
    int32_t pos;

    // pass 1: hexadecimal code points
    for(pos = 0; pos < len; ) {
        U16_NEXT(string, pos, len, c);
        if(c >= 0x10000) {
            sprintf(buffer, "%06X ", c);
        } else {
            sprintf(buffer, "%04X ", c);
        }
        result.append(buffer);
    }

    // pass 2: character names in braces
    for(pos = 0; pos < len; ) {
        U16_NEXT(string, pos, len, c);
        u_charName(c, U_EXTENDED_CHAR_NAME, buffer, 256, &status);
        result.append("{");
        result.append(buffer);
        result.append("} ");
    }
    return result;
}
/*
 * Formats this line as one quoted rule line of a resource bundle:
 *   "<operator><name>[ / <expansion>]" //<code points and names>\n
 * The operator is "&" for a reset and the strength string otherwise; the
 * name is written in NFC and wrapped in apostrophes when it contains a
 * character from the needsQuoting set.
 */
UnicodeString
Line::toBundleString()
{
UnicodeString result;
UErrorCode status = U_ZERO_ERROR;
// The shared quoting set is created on first use and kept in the static member.
if(!needsQuoting) {
needsQuoting = new UnicodeSet("[[:whitespace:][:c:][:z:][[:ascii:]-[a-zA-Z0-9]]]", status);
}
// NOTE(review): status is never inspected, neither after building the set
// nor after normalising into the 50-unit buffer.
UChar NFC[50];
int32_t NFCLen = unorm_normalize(name, len, UNORM_NFC, 0, NFC, 50, &status);
result.append("\"");
if(isReset) {
result.append("&");
} else {
result.append(strengthToString(strength, FALSE, FALSE));
}
// Quote when either the raw name or its NFC form contains a character from the set.
UBool quote = needsQuoting->containsSome(name) || needsQuoting->containsSome(NFC);
if(quote) {
result.append("'");
}
// A leading double quote is written as an escape so it cannot end the bundle string.
// NOTE(review): in that case only the escape is appended, not the rest of NFC.
if(NFC[0] == 0x22) {
result.append("\\u0022");
} else {
result.append(NFC, NFCLen);
}
// No closing apostrophe is added when the text itself starts with an apostrophe.
if(quote && NFC[0] != 0x0027) {
result.append("'");
}
// Expansion part, only for non-reset lines that have one.
if(expLen && !isReset) {
quote = needsQuoting->containsSome(expansionString);
result.append(" / ");
if(quote) {
result.append("'");
}
result.append(expansionString);
if(quote) {
result.append("'");
}
}
// Trailing comment with the code points and names of the rule's characters.
result.append("\" //");
result.append(stringToName(NFC, NFCLen));
if(expLen && !isReset) {
result.append(" / ");
result.append(stringToName(expansionString, expLen));
}
result.append("\n");
return result;
}
/*
 * Formats this line as one HTML entry: a <span> whose title lists the code
 * points and names of the NFC form of the name (plus those of the expansion
 * for non-reset lines that have one), containing "&amp;" for a reset or the
 * HTML strength string otherwise, then the NFC text and the optional
 * expansion, closed by "</span><br>".
 */
UnicodeString
Line::toHTMLString()
{
    UErrorCode status = U_ZERO_ERROR;
    UChar NFC[50];
    int32_t NFCLen = unorm_normalize(name, len, UNORM_NFC, 0, NFC, 50, &status);
    // the expansion is shown only for non-reset lines that have one
    const UBool showExpansion = (expLen && !isReset) ? TRUE : FALSE;

    // opening tag with the tooltip
    UnicodeString result;
    result.append("<span title=\"");
    result.append(stringToName(NFC, NFCLen));
    if(showExpansion) {
        result.append(" / ");
        result.append(stringToName(expansionString, expLen));
    }
    result.append("\">");

    // visible text: operator, name, optional expansion
    if(!isReset) {
        result.append(strengthToString(strength, FALSE, TRUE));
    } else {
        result.append("&amp;");
    }
    result.append(NFC, NFCLen);
    if(showExpansion) {
        result.append("&nbsp;/&nbsp;");
        result.append(expansionString);
    }

    result.append("</span><br>\n");
    return result;
}
/*
 * Returns the line as text.
 *   pretty == FALSE: the raw name, followed by "/" and the expansion if any.
 *   pretty == TRUE:  the NFC form of the name, the optional "/" expansion,
 *                    then " # " and the code points and names of the name
 *                    (and, after "/ ", of the expansion).
 */
UnicodeString
Line::toString(UBool pretty) {
    UnicodeString result;

    if(!pretty) {
        result.setTo(name);
        if(expLen) {
            result.append("/");
            result.append(expansionString);
        }
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    UChar NFC[50];
    int32_t NFCLen = unorm_normalize(name, len, UNORM_NFC, 0, NFC, 50, &status);

    // the text itself
    result.setTo(NFC, NFCLen);
    if(expLen) {
        result.append("/");
        result.append(expansionString);
    }

    // trailing description
    result.append(" # ");
    result.append(stringToName(NFC, NFCLen));
    if(expLen) {
        result.append("/ ");
        result.append(stringToName(expansionString, expLen));
    }
    return result;
}
/*
 * Sets the name to a NUL-terminated copy of `string` and refreshes
 * firstCC/lastCC from its first and last code point.
 * An empty string yields combining class 0 at both ends; without the guard
 * the lookup of the last code point would index name[-1].
 * NOTE(review): the length is not checked against the size of the name buffer.
 */
void
Line::setTo(const UnicodeString &string) {
    int32_t len = string.length();
    u_strncpy(name, string.getBuffer(), len);
    name[len] = 0;
    this->len = len;
    if(len == 0) {
        firstCC = 0;
        lastCC = 0;
        return;
    }
    UChar32 c;
    U16_GET(name, 0, 0, len, c);
    firstCC = u_getCombiningClass(c);
    U16_GET(name, 0, len-1, len, c);
    lastCC = u_getCombiningClass(c);
}
void
Line::setTo(const UChar32 n) {
UBool isError = FALSE;
len = 0; // we are setting the line to char, not appending
U16_APPEND(name, len, 25, n, isError);
name[len] = 0;
firstCC = u_getCombiningClass(n);
lastCC = firstCC;
}
/*
 * Appends indentation for `strength` to `result` and returns a copy of it:
 * (strength+1)*indentSize blanks, or 5*indentSize blanks for
 * UCOL_IDENTICAL. Strengths above UCOL_IDENTICAL add nothing.
 */
UnicodeString
Line::strengthIndent(UColAttributeValue strength, int indentSize, UnicodeString &result)
{
    if(strength > UCOL_IDENTICAL) {
        return result;
    }
    int numIndents = strength+1;
    if(strength == UCOL_IDENTICAL) {
        numIndents = 5;
    }
    int remaining = numIndents*indentSize;
    while(remaining > 0) {
        result.append(" ");
        --remaining;
    }
    return result;
}
// Returns the relation operator text for a collation strength.
//   html   == TRUE : HTML markup with non-breaking-space indentation.
//   html   == FALSE: plain text; when `pretty` is set, the operators for
//                    primary through identical get one extra leading blank.
// UCOL_OFF and any unrecognized strength map to marker operators
// (">?" and "?!") and are never padded.
UnicodeString
Line::strengthToString(UColAttributeValue strength, UBool pretty, UBool html) {
    UnicodeString result;
    const char *op = NULL;

    if(html) {
        switch(strength) {
        case UCOL_IDENTICAL:
            op = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;=&nbsp;";
            break;
        case UCOL_QUATERNARY:
            op = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&lt;&lt;&nbsp;";
            break;
        case UCOL_TERTIARY:
            op = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&lt;&nbsp;";
            break;
        case UCOL_SECONDARY:
            op = "&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&nbsp;";
            break;
        case UCOL_PRIMARY:
            op = "&nbsp;&nbsp;&lt;&nbsp;";
            break;
        case UCOL_OFF:
            op = "&nbsp;&gt;?&nbsp;";
            break;
        default:
            op = "&nbsp;?!&nbsp;";
            break;
        }
        result.append(op);
        return result;
    }

    // Plain text: only the real strengths receive the "pretty" padding.
    UBool padWhenPretty = TRUE;
    switch(strength) {
    case UCOL_IDENTICAL:
        op = " = ";
        break;
    case UCOL_QUATERNARY:
        op = " <<<< ";
        break;
    case UCOL_TERTIARY:
        op = " <<< ";
        break;
    case UCOL_SECONDARY:
        op = " << ";
        break;
    case UCOL_PRIMARY:
        op = " < ";
        break;
    case UCOL_OFF:
        op = " >? ";
        padWhenPretty = FALSE;
        break;
    default:
        op = " ?! ";
        padWhenPretty = FALSE;
        break;
    }
    if(pretty && padWhenPretty) {
        result.append(" ");
    }
    result.append(op);
    return result;
}
// Walks the `next` chain starting after this line and returns the first
// line whose strength is UCOL_IDENTICAL, or NULL when the chain ends first.
Line *
Line::nextInteresting() {
    Line *candidate = this->next;
    for(; candidate != NULL; candidate = candidate->next) {
        if(candidate->strength == UCOL_IDENTICAL) {
            break;
        }
    }
    return candidate;
}
void
Line::append(const UChar* n, int32_t length)
{
u_strncat(name, n, length);
name[len+length] = 0;
len += length;
UChar32 end;
U16_GET(n, 0, length-1, length, end);
lastCC = u_getCombiningClass(end);
}
void
Line::append(const UChar n)
{
name[len] = n;
name[len+1] = 0;
len++;
lastCC = u_getCombiningClass(n);
}
void
Line::append(const Line &l)
{
append(l.name, l.len);
lastCC = l.lastCC;
}
// Empties the name. Only `name` and `len` are reset; all other fields keep
// their values.
void
Line::clear()
{
    len = 0;
    name[0] = 0;
}
// Serializes this line into `buff` as one NUL-terminated text record and
// returns the number of characters written (not counting the NUL).
//
// Record layout, in the order produced below:
//   name code units as 4-digit hex, separated by single blanks
//   "/"
//   expansion code units in the same form (omitted when expLen == 0)
//   "; "
//   strength as "%02i" followed by a blank
//   strengthFromEmpty, cumulativeStrength, expStrength, each as "%02i"
//   isRemoved and isReset as one digit each, followed by a blank
//   expIndex as 8-digit hex
//
// The buffer capacity and error code parameters are unnamed and unused, so
// the caller must supply a buffer large enough for the whole record.
// NOTE(review): resLen is advanced by fixed widths rather than by what
// sprintf reports, so any value printing wider than its field would be
// partly overwritten by the next field - confirm all values fit.
int32_t
Line::write(char *buff, int32_t, UErrorCode &)
{
/*
UChar name[25];
int32_t len;
UChar expansionString[25];
int32_t expLen;
UColAttributeValue strength;
UColAttributeValue strengthFromEmpty;
UColAttributeValue cumulativeStrength;
UColAttributeValue expStrength;
Line *previous;
Line *next;
UBool isContraction;
UBool isExpansion;
UBool isRemoved;
UBool isReset;
int32_t expIndex;
uint8_t firstCC;
uint8_t lastCC;
*/
int32_t resLen = 0;
int32_t i = 0;
// The first name unit is written unconditionally, even when len == 0.
sprintf(buff+resLen, "%04X", name[0]);
resLen += 4;
for(i = 1; i < len; i++) {
sprintf(buff+resLen, " %04X", name[i]);
resLen += 5;
}
// Separator between name and expansion.
sprintf(buff+resLen, "/");
resLen += 1;
i = 0;
if(expLen) {
sprintf(buff+resLen, "%04X", expansionString[0]);
resLen += 4;
for(i = 1; i < expLen; i++) {
sprintf(buff+resLen, " %04X", expansionString[i]);
resLen += 5;
}
}
// End of the string part of the record.
sprintf(buff+resLen, "; ");
resLen += 2;
// The four strength values; only the first is followed by a blank.
sprintf(buff+resLen, "%02i ", strength);
resLen += 3;
sprintf(buff+resLen, "%02i", strengthFromEmpty);
resLen += 2;
sprintf(buff+resLen, "%02i", cumulativeStrength);
resLen += 2;
sprintf(buff+resLen, "%02i", expStrength);
resLen += 2;
// Various flags. The only interesting ones are isReset and isRemoved. We will not output removed lines
//sprintf(buff+resLen, "%1i%1i%1i%1i ", isContraction, isExpansion, isRemoved, isReset);
//resLen += 5;
sprintf(buff+resLen, "%1i%1i ", isRemoved, isReset);
resLen += 3;
// first and last CC
// can be calculated on reading
//sprintf(buff+resLen, "%03i %03i ", firstCC, lastCC);
//resLen += 8;
sprintf(buff+resLen, "%08X", expIndex);
resLen += 8;
buff[resLen] = 0;
return resLen;
}
void
Line::initFromString(const char *buff, int32_t, UErrorCode &)
{
int32_t bufIndex = 0;
int32_t i = 0;
sscanf(buff+bufIndex, "%04X", &name[i]);
i++;
bufIndex += 4;
while(buff[bufIndex] != '/') {
sscanf(buff+bufIndex, " %04X", &name[i]);
i++;
bufIndex += 5;
}
len = i;
name[len] = 0;
bufIndex++;
if(i > 1) {
isContraction = TRUE;
} else {
isContraction = FALSE;
}
if(buff[bufIndex] == ';') {
isExpansion = FALSE;
bufIndex += 2;
expansionString[0] = 0;
expLen = 0;
} else {
i = 0;
sscanf(buff+bufIndex, "%04X", &expansionString[i]);
i++;
bufIndex += 4;
while(buff[bufIndex] != ';') {
sscanf(buff+bufIndex, " %04X", &expansionString[i]);
i++;
bufIndex += 5;
}
expLen = i;
expansionString[expLen] = 0;
bufIndex += 2;
}
sscanf(buff+bufIndex, "%02i ", &strength);
bufIndex += 3;
sscanf(buff+bufIndex, "%02i", &strengthFromEmpty);
bufIndex += 2;
sscanf(buff+bufIndex, "%02i", &cumulativeStrength);
bufIndex += 2;
sscanf(buff+bufIndex, "%02i", &expStrength);
bufIndex += 2;
sscanf(buff+bufIndex, "%1i%1i ", &isRemoved, &isReset);
bufIndex += 3;
sscanf(buff+bufIndex, "%08X", &expIndex);
bufIndex += 8;
// calculate first and last CC
UChar32 c;
U16_GET(name, 0, 0, len, c);
firstCC = u_getCombiningClass(c);
U16_GET(name, 0, len-1, len, c);
lastCC = u_getCombiningClass(c);
}
// Inverts the case of every code point in `string` in place: uppercase
// becomes lowercase, lowercase becomes uppercase, everything else is kept.
// `sLen` is updated to the length of the converted text.
void
Line::swapCase(UChar *string, int32_t &sLen)
{
    UChar swapped[256];
    int32_t readPos = 0;
    int32_t writePos = 0;
    UBool appendFailed = FALSE;  // set by U16_APPEND; not inspected afterwards
    UChar32 cp = 0;

    while(readPos < sLen) {
        U16_NEXT(string, readPos, sLen, cp);
        UChar32 flipped = cp;
        if(u_isUUppercase(cp)) {
            flipped = u_tolower(cp);
        } else if(u_isULowercase(cp)) {
            flipped = u_toupper(cp);
        }
        U16_APPEND(swapped, writePos, 256, flipped, appendFailed);
    }

    swapped[writePos] = 0;
    u_strcpy(string, swapped);
    sLen = writePos;
}
void
Line::swapCase()
{
swapCase(name, len);
swapCase(expansionString, expLen);
}
// Returns the sort key of this line as blank-separated two-digit hex bytes.
// The key is read up to its first zero byte. Output stops once the local
// 256-byte text buffer is nearly full. A NULL sortKey yields an empty string.
UnicodeString
Line::dumpSortkey()
{
    char hex[256];
    hex[0] = 0;

    if(sortKey) {
        int32_t used = 0;
        for(uint8_t *p = sortKey; *p; p++) {
            sprintf(hex + used, "%02X ", *p);
            used += 3;
            // Each byte takes three characters plus the NUL written by
            // sprintf; stop before the next one would no longer fit.
            if(used > 252) {
                break;
            }
        }
    }
    return UnicodeString(hex);
}

113
tools/colprobe/line.h Executable file
View file

@ -0,0 +1,113 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File line.h
*
* Modification History:
*
* Date Name Description
* 03/18/2003 weiv Creation.
*******************************************************************************
*/
//
// class Line
//
// Each line from the source file (containing a name, presumably) gets
// one of these structs.
//
#ifndef COLPROBE_LINE_H
#define COLPROBE_LINE_H
#include "unicode/utypes.h"
#include "unicode/ucol.h"
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "colprobe.h"
#include <stdlib.h>
#include <string.h>
static const int MAX_EXPANSION_PREFIXES = 10;
// One entry in the collation probe: a short string (`name`), an optional
// expansion string, the strengths determined for it, and links to the
// neighbouring lines. All data members are public and are manipulated
// directly by the other colprobe classes.
class Line {
public:
static void copyArray(Line *dest, const Line *src, int32_t size);
// Construction from nothing, another line, raw UTF-16, a UnicodeString,
// a single code unit, or a text record as produced by write().
Line();
Line(const Line &other);
Line(const UChar* name, int32_t len);
Line(const UnicodeString &string);
Line(const UChar name);
Line(const char *buff, int32_t buffLen, UErrorCode &status);
~Line();
Line & operator=(const Line &other);
UBool operator==(const Line &other) const;
UBool operator!=(const Line &other) const;
void setToConcat(const Line *first, const Line *second);
void setName(const UChar* name, int32_t len);
// Text renderings of the line.
UnicodeString toString(UBool pretty = FALSE);
UnicodeString toBundleString();
UnicodeString toHTMLString();
// Text serialization; write() and initFromString() share one record format.
int32_t write(char *buff, int32_t buffLen, UErrorCode &status);
void initFromString(const char *buff, int32_t buffLen, UErrorCode &status);
// Formatting helpers for strength values.
UnicodeString strengthIndent(UColAttributeValue strength, int indentSize, UnicodeString &result);
UnicodeString strengthToString(UColAttributeValue strength, UBool pretty, UBool html = FALSE);
UnicodeString stringToName(UChar *string, int32_t len);
// Replace the name; firstCC and lastCC are recomputed.
void setTo(const UnicodeString &string);
void setTo(const UChar32 n);
UBool equals(const Line &other) const;
// Next line in the `next` chain whose strength is UCOL_IDENTICAL, or NULL.
Line *nextInteresting();
// Append to the name; lastCC is updated.
void append(const UChar n);
void append(const UChar* n, int32_t length);
void append(const Line &l);
// Empties the name (name[0] = 0, len = 0).
void clear();
// Swap upper and lower case in the name and expansion string.
void swapCase();
void swapCase(UChar *string, int32_t &sLen);
// Hex dump of sortKey, read up to its first zero byte.
UnicodeString dumpSortkey();
void init();
public:
// NUL-terminated UTF-16 name and its length in code units.
UChar name[25];
int32_t len;
// NUL-terminated UTF-16 expansion and its length; expLen == 0 means none.
UChar expansionString[25];
int32_t expLen;
UColAttributeValue strength;
UColAttributeValue strengthFromEmpty;
UColAttributeValue cumulativeStrength;
UColAttributeValue expStrength;
// Links to the neighbouring lines.
Line *previous;
Line *next;
// In case this element is a contraction
// we keep a pointer at which lines were components
Line *left;
Line *right;
UBool isContraction;
UBool isExpansion;
UBool isRemoved;
UBool isReset;
int32_t expIndex;
// Canonical combining class of the first and of the last code point of name.
uint8_t firstCC;
uint8_t lastCC;
// Sort key bytes; may be NULL (dumpSortkey() checks for that).
uint8_t *sortKey;
public:
static UnicodeSet *needsQuoting;
};
#endif //COLPROBE_LINE_H

241
tools/colprobe/locale.txt Executable file
View file

@ -0,0 +1,241 @@
af
af_ZA
am
am_ET
ar
ar_AE
ar_BH
ar_DZ
ar_EG
ar_IN
ar_IQ
ar_JO
ar_KW
ar_LB
ar_LY
ar_MA
ar_OM
ar_QA
ar_SA
ar_SD
ar_SY
ar_TN
ar_YE
be
be_BY
bg
bg_BG
bn
bn_IN
ca
ca_ES
#ca_ES_PREEURO
cs
cs_CZ
da
da_DK
de
de_AT
#de_AT_PREEURO
de_BE
de_CH
de_DE
#de_DE_PREEURO
de_LU
#de_LU_PREEURO
de__PHONEBOOK
el
el_GR
#el_GR_PREEURO
en
en_AU
en_BE
#en_BE_PREEURO
en_BW
en_CA
en_GB
#en_GB_EURO
en_HK
en_IE
#en_IE_PREEURO
en_IN
en_MT
en_NZ
en_PH
en_SG
en_US
en_US_POSIX
en_VI
en_ZA
en_ZW
eo
es
es_AR
es_BO
es_CL
es_CO
es_CR
es_DO
es_EC
es_ES
#es_ES_PREEURO
es_GT
es_HN
es_MX
es_NI
es_PA
es_PE
es_PR
es_PY
es_SV
es_US
es_UY
es_VE
es__TRADITIONAL
et
et_EE
eu
eu_ES
#eu_ES_PREEURO
fa
fa_AF
fa_IR
fi
fi_FI
#fi_FI_PREEURO
fo
fo_FO
fr
fr_BE
#fr_BE_PREEURO
fr_CA
fr_CH
fr_FR
#fr_FR_PREEURO
fr_LU
#fr_LU_PREEURO
ga
ga_IE
#ga_IE_PREEURO
gl
gl_ES
#gl_ES_PREEURO
gu
gu_IN
gv
gv_GB
he
he_IL
hi
hi_IN
hi__DIRECT
hr
hr_HR
hu
hu_HU
hy
hy_AM
hy_AM_REVISED
id
id_ID
is
is_IS
it
it_CH
it_IT
#it_IT_PREEURO
ja
ja_JP
#ja_JP_TRADITIONAL
kk_KZ
kl
kl_GL
kn
kn_IN
ko
ko_KR
kok
kok_IN
kw
kw_GB
lt
lt_LT
lv
lv_LV
mk
mk_MK
mr
mr_IN
ms_MY
mt
mt_MT
nb
nb_NO
nl
nl_BE
#nl_BE_PREEURO
nl_NL
#nl_NL_PREEURO
nn
nn_NO
om
om_ET
om_KE
pl
pl_PL
ps
ps_AF
pt
pt_BR
pt_PT
#pt_PT_PREEURO
ro
ro_RO
ru
ru_RU
ru_UA
sh
sh_YU
sk
sk_SK
sl
sl_SI
so
so_DJ
so_ET
so_KE
so_SO
sq
sq_AL
sr
sr_YU
sv
sv_FI
sv_SE
sw
sw_KE
sw_TZ
ta
ta_IN
te
te_IN
th
th_TH
#th_TH_TRADITIONAL
ti
ti_ER
ti_ET
tr
tr_TR
uk
uk_UA
vi
vi_VN
zh
zh_CN
zh_HK
zh_MO
zh_SG
zh_TW
zh_TW_STROKE
zh__PINYIN

48
tools/colprobe/longname.cpp Executable file
View file

@ -0,0 +1,48 @@
#include "unicode/unistr.h"
#include "unicode/locid.h"
#include "unicode/ucnv.h"
#include <stdio.h>
int main(int argc,
char* argv[])
{
UErrorCode status = U_ZERO_ERROR;
const char *loc = argv[1];
int32_t hasCountry;
UConverter *conv = ucnv_open("utf8", &status);
UChar UBuffer[256];
int32_t uBufLen = 0;
char buffer[256];
int32_t bufLen = 0;
uBufLen = uloc_getDisplayLanguage(loc, "en", UBuffer, 256, &status);
bufLen = ucnv_fromUChars(conv, buffer, 256, UBuffer, uBufLen, &status);
//u_UCharsToChars(UBuffer, buffer, uBufLen);
buffer[bufLen] = 0;
printf("%s", buffer);
if(hasCountry = uloc_getCountry(loc, buffer, 256, &status)) {
uBufLen = uloc_getDisplayCountry(loc, "en", UBuffer, 256, &status);
bufLen = ucnv_fromUChars(conv, buffer, 256, UBuffer, uBufLen, &status);
//u_UCharsToChars(UBuffer, buffer, uBufLen);
buffer[bufLen] = 0;
printf("_%s", buffer);
}
if(uloc_getVariant(loc, buffer, 256, &status)) {
uBufLen = uloc_getDisplayVariant(loc, "en", UBuffer, 256, &status);
bufLen = ucnv_fromUChars(conv, buffer, 256, UBuffer, uBufLen, &status);
//u_UCharsToChars(UBuffer, buffer, uBufLen);
buffer[bufLen] = 0;
if(!hasCountry) {
printf("_");
}
printf("_%s", buffer);
}
printf("\n");
return 0;
}

4
tools/colprobe/readme.txt Executable file
View file

@ -0,0 +1,4 @@
There are several tools in this directory that should make it easier to generate collation data:
extractCollationData.pl - perl script that reads ICU resource bundle files and outputs a locale_collation.html file if collation elements are present in the locale. Arguments are the list of locale source files (*.txt) that need to be processed.
createComparisonTables.pl - takes a locale name and looks in the directories that should contain the HTML data produced by colprobe or extractCollationData.pl.
tableStarter.pl - invokes createComparisonTables.pl with a list of locales.

2067
tools/colprobe/sortedlines.cpp Executable file

File diff suppressed because it is too large Load diff

120
tools/colprobe/sortedlines.h Executable file
View file

@ -0,0 +1,120 @@
#ifndef COLPROBE_SORTEDLINES_H
#define COLPROBE_SORTEDLINES_H
// colprobe includes
#include "colprobe.h"
#include "line.h"
#include "uprinter.h"
#include "strengthprobe.h"
// ICU includes
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/uscript.h"
#include "hash.h"
// A collection of Line objects together with the state needed to sort and
// analyse them using a StrengthProbe, and to render the result as text,
// resource bundle, HTML or XML.
// NOTE(review): only the declarations are visible here; the comments below
// are derived from names and signatures and should be checked against
// sortedlines.cpp.
class SortedLines {
Line empty;
// One entry per strength level (array size UCOL_OFF).
Line *UB[UCOL_OFF];
UnicodeSet ignorables[UCOL_OFF];
// Array of pointers used for sorting, with its capacity.
Line **toSort;
int32_t toSortCapacity;
// Storage for the lines, with the number in use and the allocated capacity.
Line *lines;
int32_t size;
int32_t capacity;
UnicodeSet repertoire;
UnicodeSet excludeBounds;
StrengthProbe probe;
// Cursor state for getFirst()/getLast()/getNext()/getPrevious().
Line *first;
Line *last;
Line *current;
// Default construction is private: use one of the public constructors.
SortedLines() {};
UPrinter *logger;
UPrinter *debug;
Hashtable *contractionsTable;
Hashtable *duplicators; // elements that duplicate preceding characters
int32_t maxExpansionPrefixSize;
// Properties of the sort
UBool wordSort;
UBool frenchSecondary;
UBool upperFirst;
uint8_t *sortkeys;
int32_t sortkeyOffset;
public:
// Build from a set of characters and a probe, or from a file.
SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe, UPrinter *logger, UPrinter *debug);
SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status);
~SortedLines();
void analyse(UErrorCode &status);
void sort(UBool setStrengths = TRUE, UBool link = FALSE);
void sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths = TRUE, UBool link = FALSE);
Line *getFirst();
Line *getLast();
void add(Line *line, UBool linkIn = FALSE);
void insert(Line *line, int32_t index);
Line *getNext();
Line *getPrevious();
Line *operator[](int32_t index);
int32_t addContractionsToRepertoire(UErrorCode &status);
int32_t getSize() const;
int32_t detectExpansions();
// Output in various formats.
UnicodeString toString(UBool useLinks = FALSE);
UnicodeString toStringFromEmpty();
UnicodeString toPrettyString(UBool useLinks, UBool printSortKeys = FALSE);
UnicodeString toOutput(const char *format,
const char *locale, const char *platform, const char *reference,
UBool useLinks, UBool initialize, UBool moreToCome);
UnicodeString toBundle(const char *locale, const char *platform, const char *reference,
UBool useLinks, UBool initialize, UBool moreToCome);
UnicodeString toHTML(const char *locale, const char *platform, const char *reference,
UBool useLinks, UBool initialize, UBool moreToCome);
UnicodeString toXML(const char *locale, const char *platform, const char *reference,
UBool useLinks, UBool initialize, UBool moreToCome);
UnicodeString arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys);
void setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort);
int32_t setSortingArray(Line **sortingArray, Hashtable *table);
void reduceDifference(SortedLines& reference);
void getRepertoire(UnicodeSet &fillIn);
void removeDecompositionsFromRepertoire();
void getBounds(UErrorCode &status);
void classifyRepertoire();
void toFile(FILE *file, UBool useLinks, UErrorCode &status);
void swapCase();
void calculateSortKeys();
void calculateSortKey(Line &line);
private:
void init();
void init(UnicodeSet &rep, Line *lin);
int32_t detectContractions(Line **firstRep, int32_t firstSize,
Line **secondRep, int32_t secondSize,
Line *toAddTo, int32_t &toAddToSize,
Line *lesserToAddTo, int32_t &lesserToAddToSize,
int32_t capacity, UErrorCode &status);
void calculateCumulativeStrengths(Line *start, Line *end);
void transferCumulativeStrength(Line *previous, Line *that);
void updateBounds(UnicodeSet &set);
void addAll(Line* toAdd, int32_t toAddSize);
void setDistancesFromEmpty(Line* array, int32_t arraySize);
void noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status);
int32_t gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength);
UBool getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine);
};
#endif // #ifndef COLPROBE_SORTEDLINES_H

402
tools/colprobe/strengthprobe.cpp Executable file
View file

@ -0,0 +1,402 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File strengthprobe.cpp
*
* Modification History:
*
* Date Name Description
* 07/07/2003 weiv Creation.
*******************************************************************************
*/
//
// class StrengthProbe
//
// Determines the collation strength that separates two strings by
// comparing them, with probe characters attached, through a
// caller-supplied comparison function.
//
#include "strengthprobe.h"
// Creates a probe that compares through `comparer` and obtains sort keys
// through `getter`. SE is the separator character and B0..B3 are the probe
// characters. The scratch-line pointers are bound to this object's own
// scratch lines.
StrengthProbe::StrengthProbe(CompareFn comparer, GetSortKeyFn getter, UChar SE,
                             UChar B0, UChar B1, UChar B2, UChar B3)
    : SE(SE),
      B0(B0),
      B1(B1),
      B2(B2),
      B3(B3),
      utilFirstP(&utilFirst),
      utilSecondP(&utilSecond),
      frenchSecondary(FALSE),
      comparer(comparer),
      skgetter(getter)
{
}
// Installs a new set of probe characters and returns the result of
// checkSanity() for them (0 means the characters are usable).
int
StrengthProbe::setProbeChars(UChar B0, UChar B1, UChar B2, UChar B3)
{
    this->B0 = B0;
    this->B1 = B1;
    this->B2 = B2;
    this->B3 = B3;

    const int sanity = checkSanity();
    return sanity;
}
// Verifies that the probe characters are usable with the current comparer.
//
// Returns 0 when all checks pass. The pairwise checks require
// B0 < B3 < B2 < B1; the first pair found out of order ends the check with
// (comparer result)*10 + k, where k (1..6) identifies the failing pair.
// After that each probe character must be no further than UCOL_PRIMARY from
// the empty string, otherwise 1000 + (index of the character) is returned.
//
// The statements depend on their order: utilFirst/utilSecond are scratch
// lines that keep their value from one comparison to the next.
int
StrengthProbe::checkSanity()
{
int sanityRes;
// B0 must sort before B3, B2 and B1.
utilFirst.setTo(B0);
utilSecond.setTo(B3);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 3;
}
utilSecond.setTo(B2);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 2;
}
utilSecond.setTo(B1);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 1;
}
// B3 must sort before B2 and B1.
utilFirst.setTo(B3);
utilSecond.setTo(B2);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 5;
}
utilSecond.setTo(B1);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 4;
}
// B2 must sort before B1 (utilSecond still holds B1).
utilFirst.setTo(B2);
if((sanityRes = comparer(&utilFirstP, &utilSecondP)) >= 0) {
return sanityRes*10 + 6;
}
// Each probe character must differ from the empty string at primary level.
utilFirst.setTo(B0);
if(distanceFromEmptyString(utilFirst) > UCOL_PRIMARY) {
return 1000;
}
utilFirst.setTo(B1);
if(distanceFromEmptyString(utilFirst) > UCOL_PRIMARY) {
return 1001;
}
utilFirst.setTo(B2);
if(distanceFromEmptyString(utilFirst) > UCOL_PRIMARY) {
return 1002;
}
utilFirst.setTo(B3);
if(distanceFromEmptyString(utilFirst) > UCOL_PRIMARY) {
return 1003;
}
return 0;
}
// Compares  first + SE + x  with  second + SE + y  and returns TRUE when the
// former sorts strictly before the latter. The probe strings are built in
// the utilFirst / utilSecond scratch lines.
UBool
StrengthProbe::probePrefix(const Line &x, const Line &y, UChar first, UChar second) {
    UChar *lhs = utilFirst.name;
    lhs[0] = first;
    lhs[1] = SE;
    u_strcpy(lhs + 2, x.name);
    lhs[x.len + 2] = 0;
    utilFirst.len = x.len + 2;

    UChar *rhs = utilSecond.name;
    rhs[0] = second;
    rhs[1] = SE;
    u_strcpy(rhs + 2, y.name);
    rhs[y.len + 2] = 0;
    utilSecond.len = y.len + 2;

    return (comparer(&utilFirstP, &utilSecondP) < 0) ? TRUE : FALSE;
}
// Compares  x + SE + first  with  y + SE + second  and returns TRUE when the
// former sorts strictly before the latter. The probe strings are built in
// the utilFirst / utilSecond scratch lines.
UBool
StrengthProbe::probeSuffix(const Line &x, const Line &y, UChar first, UChar second) {
    UChar *lhs = utilFirst.name;
    u_strcpy(lhs, x.name);
    lhs[x.len] = SE;
    lhs[x.len + 1] = first;
    lhs[x.len + 2] = 0;
    utilFirst.len = x.len + 2;

    UChar *rhs = utilSecond.name;
    u_strcpy(rhs, y.name);
    rhs[y.len] = SE;
    rhs[y.len + 1] = second;
    rhs[y.len + 2] = 0;
    utilSecond.len = y.len + 2;

    return (comparer(&utilFirstP, &utilSecondP) < 0) ? TRUE : FALSE;
}
// Like probePrefix(), but without the separator: compares  first + x  with
// second + y  and returns TRUE when the former sorts strictly before the
// latter.
UBool
StrengthProbe::probePrefixNoSep(const Line &x, const Line &y, UChar first, UChar second) {
    UChar *lhs = utilFirst.name;
    lhs[0] = first;
    u_strcpy(lhs + 1, x.name);
    lhs[x.len + 1] = 0;
    utilFirst.len = x.len + 1;

    UChar *rhs = utilSecond.name;
    rhs[0] = second;
    u_strcpy(rhs + 1, y.name);
    rhs[y.len + 1] = 0;
    utilSecond.len = y.len + 1;

    return (comparer(&utilFirstP, &utilSecondP) < 0) ? TRUE : FALSE;
}
// Like probeSuffix(), but without the separator: compares  x + first  with
// y + second  and returns TRUE when the former sorts strictly before the
// latter.
UBool
StrengthProbe::probeSuffixNoSep(const Line &x, const Line &y, UChar first, UChar second) {
    UChar *lhs = utilFirst.name;
    u_strcpy(lhs, x.name);
    lhs[x.len] = first;
    lhs[x.len + 1] = 0;
    utilFirst.len = x.len + 1;

    UChar *rhs = utilSecond.name;
    u_strcpy(rhs, y.name);
    rhs[y.len] = second;
    rhs[y.len + 1] = 0;
    utilSecond.len = y.len + 1;

    return (comparer(&utilFirstP, &utilSecondP) < 0) ? TRUE : FALSE;
}
// Determines the strength of the difference between x and y, where x is
// expected to sort before or equal to y.
//
// Returns:
//   - distanceFromEmptyString(y) when x compares equal to the empty string;
//   - UCOL_IDENTICAL when x and y compare equal;
//   - UCOL_OFF when x sorts after y ("bad situation");
//   - otherwise the first level whose probe succeeds, tried in the order
//     primary, secondary, tertiary, quaternary.
UColAttributeValue
StrengthProbe::getStrength(const Line &x, const Line &y) {
const Line *xp = &x;
const Line *yp = &y;
Line empty;
Line *emptyP = &empty;
// x indistinguishable from the empty string: measure y against empty.
if(comparer(&emptyP, &xp) == 0) {
return distanceFromEmptyString(y);
}
int32_t result = comparer(&xp, &yp);
if(result == 0) {
return UCOL_IDENTICAL;
} else if(result > 0) {
return UCOL_OFF; // bad situation
} else { // we need to probe strength
// Each probe attaches a larger probe character to x than to y and checks
// whether x still sorts first.
if(probeSuffix(x, y, B1, B0)) {
//if(probePrefix(x, y, B2, B0)) { // swamps secondary difference
return UCOL_PRIMARY;
} else if(probePrefix(x, y, B3, B0)) { // swamps tertiary difference
return UCOL_SECONDARY;
} else if(probeSuffix(x, y, B3, B0)) { // swamped by tertiary difference
return UCOL_TERTIARY;
// NOTE(review): this repeats the probePrefix(x, y, B3, B0) call that was
// false two branches up, so with a deterministic comparer this branch is
// always taken when reached.
} else if(!probePrefix(x, y, B3, B0)) {
return UCOL_QUATERNARY;
}
/*
//if(probeSuffix(x, y, B1, B0)) {
if(probePrefix(x, y, B2, B0)) { // swamps secondary difference
return UCOL_PRIMARY;
} else if(probePrefix(x, y, B3, B0)) { // swamps tertiary difference
return UCOL_SECONDARY;
} else if(probeSuffix(x, y, B3, B0)) { // swamped by tertiary difference
return UCOL_TERTIARY;
} else if(!probePrefix(x, y, B3, B0)) {
return UCOL_QUATERNARY;
}
*/
}
return UCOL_OFF; // bad
}
// Convenience overload: wraps both strings in Line objects and delegates to
// the Line-based getStrength().
UColAttributeValue
StrengthProbe::getStrength(const UnicodeString &sx, const UnicodeString &sy) {
    const Line lineX(sx);
    const Line lineY(sy);
    return getStrength(lineX, lineY);
}
// Compares two strings through the configured comparison function and
// returns its result.
int32_t
StrengthProbe::compare(const UnicodeString &sx, const UnicodeString &sy) {
    Line lineX(sx);
    Line lineY(sy);
    // The comparer takes pointers to Line pointers.
    const Line *lhs = &lineX;
    const Line *rhs = &lineY;
    return comparer(&lhs, &rhs);
}
// Compares two lines through the configured comparison function and returns
// its result.
int32_t
StrengthProbe::compare(const Line &x, const Line &y) {
    // The comparer takes pointers to Line pointers.
    const Line *lhs = &x;
    const Line *rhs = &y;
    return comparer(&lhs, &rhs);
}
// Determines at which strength x differs from the empty string.
//
// Returns:
//   UCOL_IDENTICAL  - x compares equal to the empty string;
//   UCOL_OFF        - x sorts before the empty string;
//   UCOL_PRIMARY    - B0 sorts before or equal to x, or the fallback branch
//                     at the end is reached;
//   UCOL_SECONDARY / UCOL_TERTIARY / UCOL_QUATERNARY - decided by comparing
//                     SE+x+B0 against SE+B2 and SE+B3 (the secondary test is
//                     inverted when frenchSecondary is set).
//
// The sort keys fetched into the local buffers are not read by this
// function; presumably they are there to be inspected in a debugger.
UColAttributeValue
StrengthProbe::distanceFromEmptyString(const Line &x) {
// Debugging hook: a place to set a breakpoint for U+030D.
if(x.name[0] == 0x30D) {
int32_t putBreakPointHere = 0;
}
Line empty;
Line *emptyP = &empty;
uint8_t buff[256];
getSortKey(empty.name, empty.len, buff, 256);
Line B0Line(B0);
Line *B0LineP = &B0Line;
const Line *xp = &x;
int32_t result = comparer(&emptyP, &xp);
if(result == 0) {
return UCOL_IDENTICAL;
} else if(result > 0) {
return UCOL_OFF;
}
// Anything at or after the smallest probe character is a primary difference.
result = comparer(&B0LineP, &xp);
if(result <= 0) {
return UCOL_PRIMARY;
}
// sexb0 = SE + x + B0, seb0 = SE + B0
Line sexb0(SE);
sexb0.append(x.name, x.len);
sexb0.append(B0);
Line seb0(SE);
seb0.append(B0);
uint8_t seb0K[256];
uint8_t sexb0K[256];
uint8_t seb2K[256];
uint8_t seb3K[256];
memset(seb0K, 0, 256);
memset(sexb0K, 0, 256);
memset(seb2K, 0, 256);
memset(seb3K, 0, 256);
getSortKey(seb0, seb0K, 256);
getSortKey(sexb0, sexb0K, 256);
if(compare(seb0, sexb0) <= 0) {
// seb2 = SE + B2
Line seb2(SE);
seb2.append(B2);
getSortKey(seb2, seb2K, 256);
result = compare(seb2, sexb0);
if((result <= 0 && !frenchSecondary) || (result >= 0 && frenchSecondary)) { // swamps tertiary difference
return UCOL_SECONDARY;
}
// seb3 = SE + B3
Line seb3(SE);
seb3.append(B3);
getSortKey(seb3, seb3K, 256);
if(compare(seb3, sexb0) < 0) {
return UCOL_TERTIARY;
}
return UCOL_QUATERNARY;
} else {
// if this was UCA, we would have a primary difference.
// however, this might not be so, since not everybody
// makes well formed CEs.
// in cs_CZ on linux, space is tertiary ignorable, but
// its quaternary level strength is lower than quad
// strengths for non-ignorables. oh well, more testing
// required
// I think that we can only have quaternary difference
// here (in addition to primary difference).
//if(!probePrefix(x, empty, B3, B0)) {
//return UCOL_QUATERNARY;
//} else {
return UCOL_PRIMARY;
//}
}
}
// Convenience overload: wraps the string in a Line and delegates to the
// Line-based distanceFromEmptyString().
UColAttributeValue
StrengthProbe::distanceFromEmptyString(const UnicodeString &x) {
    const Line asLine(x);
    return distanceFromEmptyString(asLine);
}
// Returns the strength of the difference between prefix+x and prefix+y.
// The concatenations are built in the contractionUtil* scratch lines.
UColAttributeValue
StrengthProbe::getPrefixedStrength(const Line &prefix, const Line &x, const Line &y) {
    Line &prefixedX = contractionUtilFirst;
    Line &prefixedY = contractionUtilSecond;

    prefixedX.setToConcat(&prefix, &x);
    prefixedY.setToConcat(&prefix, &y);
    return getStrength(prefixedX, prefixedY);
}
// Copy constructor: implemented in terms of the assignment operator.
StrengthProbe::StrengthProbe(const StrengthProbe &that) {
    operator=(that);
}
// Copies the configuration of another probe: separator and probe
// characters, the French-secondary flag and the two callbacks. The scratch
// lines are not copied; the scratch pointers are bound to this object's own
// scratch lines. Self-assignment is a no-op.
StrengthProbe &
StrengthProbe::operator=(const StrengthProbe &that) {
    if(this == &that) {
        return *this;
    }

    SE = that.SE;
    B0 = that.B0;
    B1 = that.B1;
    B2 = that.B2;
    B3 = that.B3;

    frenchSecondary = that.frenchSecondary;
    comparer = that.comparer;
    skgetter = that.skgetter;

    // Must point at this object's lines, never at the source's.
    utilFirstP = &utilFirst;
    utilSecondP = &utilSecond;
    return *this;
}
// Compares  B0 + SE + B2  with  B2 + SE + B0.
//   first sorts lower  -> returns FALSE;
//   first sorts higher -> remembers frenchSecondary = TRUE and returns TRUE;
//   equal              -> sets status to U_INTERNAL_PROGRAM_ERROR, returns FALSE.
UBool
StrengthProbe::isFrenchSecondary(UErrorCode &status) {
    utilFirst.setTo(B0);
    utilFirst.append(SE);
    utilFirst.append(B2);

    utilSecond.setTo(B2);
    utilSecond.append(SE);
    utilSecond.append(B0);

    const int32_t order = compare(utilFirst, utilSecond);
    if(order == 0) {
        // The two probe strings must never compare equal.
        status = U_INTERNAL_PROGRAM_ERROR;
        return FALSE;
    }
    if(order > 0) {
        frenchSecondary = TRUE;
        return TRUE;
    }
    return FALSE;
}
// Compares each ASCII capital letter A-Z with its lowercase counterpart.
// Returns TRUE when every capital sorts before its lowercase letter and
// FALSE when every capital sorts after it. For any mixed or equal outcome
// status is set to U_INTERNAL_PROGRAM_ERROR and FALSE is returned.
UBool
StrengthProbe::isUpperFirst(UErrorCode &status) {
    int32_t upperWins = 0;
    int32_t lowerWins = 0;
    int32_t ties = 0;

    UChar i = 0;
    for(i = 0x41; i < 0x5B; i++) {
        utilFirst.setTo(i);
        utilSecond.setTo(i+0x20);   // 0x20 is the ASCII upper/lower offset
        const int32_t order = compare(utilFirst, utilSecond);
        if(order < 0) {
            upperWins++;
        } else if(order > 0) {
            lowerWins++;
        } else {
            ties++;
        }
    }

    const UBool allUpperFirst = (lowerWins == 0 && ties == 0);
    const UBool allLowerFirst = (upperWins == 0 && ties == 0);
    if(allUpperFirst) {
        return TRUE;
    }
    if(allLowerFirst) {
        return FALSE;
    }
    status = U_INTERNAL_PROGRAM_ERROR;
    return FALSE;
}

85
tools/colprobe/strengthprobe.h Executable file
View file

@ -0,0 +1,85 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File strengthprobe.h
*
* Modification History:
*
* Date Name Description
* 07/07/2003 weiv Creation.
*******************************************************************************
*/
//
// class StrengthProbe
//
// Determines the collation strength that separates two strings by
// comparing them, with probe characters attached, through a
// caller-supplied comparison function.
//
#ifndef COLPROBE_STRENGTHPROBE_H
#define COLPROBE_STRENGTHPROBE_H
#include "colprobe.h"
#include "line.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
// Measures collation strength differences between strings using only a
// comparison callback (and, for diagnostics, a sort-key callback).
class StrengthProbe {
public:
// Separator character placed between a probe character and the text.
UChar SE;
// Probe characters. checkSanity() requires B0 < B3 < B2 < B1 under the
// configured comparer.
UChar B0;
UChar B1;
UChar B2;
UChar B3;
private:
// Scratch lines in which probe strings are built, and pointers to them in
// the form the comparer expects.
Line utilFirst;
Line utilSecond;
Line *utilFirstP;
Line *utilSecondP;
// Scratch lines for getPrefixedStrength().
Line contractionUtilFirst;
Line contractionUtilSecond;
// Each probe returns TRUE when x with `first` attached sorts strictly
// before y with `second` attached (as prefix or suffix, with or without
// the separator SE).
UBool probePrefix(const Line &x, const Line &y, UChar first, UChar second);
UBool probeSuffix(const Line &x, const Line &y, UChar first, UChar second);
UBool probePrefixNoSep(const Line &x, const Line &y, UChar first, UChar second);
UBool probeSuffixNoSep(const Line &x, const Line &y, UChar first, UChar second);
// Set to TRUE by isFrenchSecondary() when it detects that ordering.
UBool frenchSecondary;
public:
CompareFn comparer;
GetSortKeyFn skgetter;
// NOTE: the default constructor leaves all scalar members uninitialized.
StrengthProbe() {};
StrengthProbe(CompareFn comparer, GetSortKeyFn getter, UChar SE = 0x0030, UChar B0 = 0x0061, UChar B1 = 0x0062, UChar B2 = 0x00E1, UChar B3 = 0x0041); //, UChar LB = 0x0039, UChar UB = 0xfa29);
// Replaces the probe characters and returns the checkSanity() result.
int setProbeChars(UChar B0, UChar B1, UChar B2, UChar B3);
// Returns 0 when the probe characters are usable, a non-zero code otherwise.
int checkSanity();
StrengthProbe(const StrengthProbe &that);
StrengthProbe &operator=(const StrengthProbe &that);
UColAttributeValue getStrength(const Line &x, const Line &y);
UColAttributeValue getStrength(const UnicodeString &x, const UnicodeString &y);
UColAttributeValue getPrefixedStrength(const Line &prefix, const Line &x, const Line &y);
int32_t compare(const UnicodeString &x, const UnicodeString &y);
int32_t compare(const Line &x, const Line &y);
UColAttributeValue distanceFromEmptyString(const Line &x);
UColAttributeValue distanceFromEmptyString(const UnicodeString &x);
UBool isFrenchSecondary(UErrorCode &status);
UBool isUpperFirst(UErrorCode &status);
// Thin wrappers that forward to the sort-key callback.
int getSortKey(const Line &l, uint8_t *buffer, int32_t buffCap) {
return skgetter(l.name, l.len, buffer, buffCap);
};
int getSortKey(UChar *string, int32_t sLen, uint8_t *buffer, int32_t buffCap) {
return skgetter(string, sLen, buffer, buffCap);
};
};
#endif //#ifndef COLPROBE_STRENGTHPROBE_H

16
tools/colprobe/tableStarter.pl Executable file
View file

@ -0,0 +1,16 @@
#!/usr/bin/perl -w

# Runs doComparisonTable.pl once for every locale in the locale list that
# contains an underscore (i.e. has a country or variant part). Lines that
# start with '#' are treated as disabled and skipped.
#
# Usage: tableStarter.pl [locale-list-file [comparison-script]]
#
# Both arguments are optional and default to the original hard-coded
# locations, so existing invocations keep working.

use strict;

my $colprobeDir = "/home/weiv/src/icu/source/extra/colprobe";
my $localeFile  = @ARGV > 0 ? $ARGV[0] : "$colprobeDir/locale.txt";
my $script      = @ARGV > 1 ? $ARGV[1] : "$colprobeDir/doComparisonTable.pl";

# Read the list directly instead of shelling out to cat.
open(my $in, '<', $localeFile) or die "Cannot open $localeFile: $!\n";
my @locales = <$in>;
close($in);
chomp(@locales);

foreach my $locale (@locales) {
    next unless $locale =~ /_/;
    next if $locale =~ /^#/;

    print "$script $locale\n";

    # List-form pipe open: the locale is passed as a single argument and is
    # never interpreted by a shell. As before, the script's standard output
    # is read and discarded.
    my $out;
    if (!open($out, '-|', $script, $locale)) {
        warn "Cannot run $script for $locale: $!\n";
        next;
    }
    while (defined(my $ignored = <$out>)) { }
    close($out) or warn "$script $locale failed (status $?)\n";
}

View file

@ -0,0 +1,8 @@
#include "targetsetgenerator.h"
// Stores the comparison function and a copy of the starting set, then adds
// every element of the starting set to this object through addAll().
TargetSetGenerator::TargetSetGenerator(UnicodeSet &startingSet, CompareFn comparer)
    : comparer(comparer),
      set(startingSet)
{
    this->addAll(startingSet);
}

View file

@ -0,0 +1,15 @@
#ifndef TARGETSETGENERATOR_H
#define TARGETSETGENERATOR_H
#include "colprobe.h"
#include "unicode/uniset.h"
// A UnicodeSet that is seeded from a starting set and remembers the
// comparison function it was created with.
class TargetSetGenerator : public UnicodeSet {
public:
// Adds all elements of startingSet to this set and stores comparer.
TargetSetGenerator(UnicodeSet &startingSet, CompareFn comparer);
private:
// Comparison callback supplied at construction.
CompareFn comparer;
// Copy of the starting set passed to the constructor.
UnicodeSet set;
};
#endif

48
tools/colprobe/template Executable file
View file

@ -0,0 +1,48 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>$locale</title>
<style>
<!--
table { border-spacing: 0; border-collapse: collapse; width: 100%;
border: 1px solid black }
td, th { width: 10%; border-spacing: 0; border-collapse: collapse; color: black;
vertical-align: top; border: 1px solid black }
-->
</style>
</head>
<body bgcolor="#FFFFFF">
<p><b><font color="#FF0000">Collation:</font> $locale <a href="http://oss.software.ibm.com/cgi-bin/icu/lx/en/?_=$locale">Demo</a>,
<a href="http://oss.software.ibm.com/cvs/icu/~checkout~/locale/all_diff_xml/comparison_charts.html">Cover
Page</a>, <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/locale/all_diff_xml/index.html">Index</a></b></p>
<table>
<tr>
<th bgcolor="#AD989D">COMMON (<a href="http://oss.software.ibm.com/cvs/icu/~checkout~/locale/common/xml/$locale.xml">xml</a>)</th>
<th bgcolor="#1191F1">LINUX (<a href="http://oss.software.ibm.com/cvs/icu/~checkout~/locale/linux/xml/$locale.xml">xml</a>)</th>
<th bgcolor="#98FB98">WINDOWS (<a href="http://oss.software.ibm.com/cvs/icu/~checkout~/locale/windows/xml/$locale.xml">xml</a>)</th>
</tr>
<tr>
<td bgcolor="#AD989D">1.0-alpha</td>
<td bgcolor="#FF6633">1.0</td>
<td bgcolor="#FF6633">=</td>
<td bgcolor="#FF6633"><span title="006E {LATIN SMALL LETTER N}">&amp;n</span><br>
<span title="006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&lt;&nbsp;ny</span><br>
<span title="006E 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y} / 006E 0079 {LATIN SMALL LETTER N} {LATIN SMALL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;=&nbsp;nny&nbsp;/&nbsp;ny</span><br>
<span title="006E 0059 {LATIN SMALL LETTER N} {LATIN CAPITAL LETTER Y}">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&lt;&lt;&lt;&nbsp;nY</span><br>
</td>
<td bgcolor="#FF6633">=</td>
<td bgcolor="#FFFF33">1.2</td>
<td bgcolor="#98FB98">Windows XP</td>
<td bgcolor="#FF6633">=</td>
<td bgcolor="#FF6633">=</td>
</tr>
</table>
</body>

49
tools/colprobe/uniqueFiles.pl Executable file
View file

@ -0,0 +1,49 @@
#!/usr/bin/perl

# Groups the files named on the command line by content.
#
# Files are visited in sorted order. Each file that has not yet been matched
# becomes the representative of a new group, and every remaining file with
# identical content joins that group. Output:
#   - one "Adding <members> to <representative>" line per group;
#   - then, per group, "<file> <group-number>" for the representative and
#     for each member.
#
# Files are compared with File::Compare rather than an external diff: no
# shell is involved, and a file that cannot be read is never reported as
# identical to another.

use strict;
use warnings;
use File::Compare;

# Files that have not been assigned to a group yet.
my %remaining = map { $_ => "" } @ARGV;

# Representative file name -> reference to the list of identical files.
my %list;

foreach my $file (sort keys %remaining) {
    # Skip files that were already absorbed into an earlier group.
    next unless exists $remaining{$file};
    delete $remaining{$file};

    my @same;
    foreach my $secondfile (sort keys %remaining) {
        # compare() returns 0 only for identical content (1: different,
        # -1: error).
        if (compare($file, $secondfile) == 0) {
            push @same, $secondfile;
        }
    }

    print "Adding @same to $file\n";
    $list{$file} = [@same];
    delete @remaining{@same};
}

my $j = 0;
foreach my $file (sort keys %list) {
    print "<$file> <$j>\n";
    foreach my $samefile (@{ $list{$file} }) {
        print "<$samefile> <$j>\n ";
    }
    $j++;
}

116
tools/colprobe/uprinter.cpp Executable file
View file

@ -0,0 +1,116 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File uprinter.cpp
*
* Modification History:
*
* Date Name Description
* 03/18/2003 weiv Creation.
*******************************************************************************
*/
#include "uprinter.h"
// Builds a printer on top of an already open C stream by handing it to
// u_finit() together with the requested locale and encoding.
//
// file                      - open stdio stream to write to
// locale                    - locale id; remembered in _locale for later
//                             formatting.  NULL is stored as an empty string.
// encoding                  - passed through to u_finit()
// transliterateNonPrintable - when true, everything except CR, LF, TAB and
//                             U+0020..U+007F is written as Any-Hex/Java
//                             escapes
UPrinter::UPrinter(FILE *file, const char *locale, const char *encoding, UBool transliterateNonPrintable) {
    out = u_finit(file, locale, encoding);
    // Every log method writes to 'out'; start switched off if there is
    // nothing to write to instead of dereferencing NULL later.
    _on = (UBool)(out != NULL);

    // Bounded copy: _locale is a fixed-size array and locale may be NULL.
    _locale[0] = 0;
    if(locale != NULL) {
        strncpy(_locale, locale, sizeof(_locale) - 1);
        _locale[sizeof(_locale) - 1] = 0;
    }

    if(transliterateNonPrintable && out != NULL) {
        UErrorCode status = U_ZERO_ERROR;
        UTransliterator *anyHex = utrans_open("[^\\u000d\\u000a\\u0009\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
        // Only install the transliterator if it was actually created.
        if(U_SUCCESS(status)) {
            u_fsettransliterator(out, U_WRITE, anyHex, &status);
        }
    }
}
// Builds a printer that writes to the file 'name', opened with u_fopen() in
// "wb" mode, and starts the file with a byte order mark.
//
// name                      - path of the file to create
// locale                    - locale id; remembered in _locale for later
//                             formatting.  NULL is stored as an empty string.
// encoding                  - passed through to u_fopen()
// trans                     - transliterator to install on the write side;
//                             when NULL, an Any-Hex/Java transliterator for
//                             everything except CR, LF, TAB and
//                             U+0020..U+007F is created instead
// transliterateNonPrintable - when false, no transliterator is installed and
//                             'trans' is ignored
UPrinter::UPrinter(const char *name, const char *locale, const char *encoding, UTransliterator *trans, UBool transliterateNonPrintable) {
    out = u_fopen(name, "wb", locale, encoding);
    // Every log method writes to 'out'; start switched off if the file
    // could not be opened instead of dereferencing NULL later.
    _on = (UBool)(out != NULL);

    // Bounded copy: _locale is a fixed-size array and locale may be NULL.
    _locale[0] = 0;
    if(locale != NULL) {
        strncpy(_locale, locale, sizeof(_locale) - 1);
        _locale[sizeof(_locale) - 1] = 0;
    }

    if(out == NULL) {
        // Nothing was opened, so there is nothing to write a BOM to and
        // nothing to attach a transliterator to.
        return;
    }

    u_fputc(0xFEFF, out); // emit a BOM

    if(transliterateNonPrintable) {
        UErrorCode status = U_ZERO_ERROR;
        if(trans == NULL) {
            UTransliterator *anyHex = utrans_open("[^\\u000d\\u000a\\u0009\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
            // Only install the transliterator if it was actually created.
            if(U_SUCCESS(status)) {
                u_fsettransliterator(out, U_WRITE, anyHex, &status);
            }
        } else {
            u_fsettransliterator(out, U_WRITE, trans, &status);
        }
    }
}
// Closes the underlying UFILE.  'out' is whatever u_finit()/u_fopen()
// returned in the constructor, so it is checked before being closed.
UPrinter::~UPrinter() {
    if(out != NULL) {
        u_fclose(out);
        out = NULL;
    }
}
void
UPrinter::log(const UnicodeString &string, UBool nl) {
if(_on) {
log(((UnicodeString)string).getTerminatedBuffer(), nl);
}
}
// Writes a UChar string to the output file and flushes it; does nothing
// while the printer is switched off.
//
// string - text to write, printed with the "%S" conversion
// nl     - when true, a newline is written after the text
void
UPrinter::log(const UChar *string, UBool nl) {
    if(!_on) {
        return;
    }

    u_fprintf(out, "%S", string);
    if(nl) {
        u_fprintf(out, "\n");
    }
    // Flush after every message, with or without a newline.
    u_fflush(out);
}
/*
void
UPrinter::log(const char *string, UBool nl) {
if(_on) {
u_fprintf(out, "%s", string);
if(nl) {
u_fprintf(out, "\n");
}
}
}
*/
// Writes a Line: its name, followed by "/" and its expansion string when
// expLen is non-zero.  Does nothing while the printer is switched off.
//
// line - line to print
// nl   - when true, a newline is written and the file flushed afterwards
void
UPrinter::log(const Line *line, UBool nl) {
    if(!_on) {
        return;
    }

    log(line->name);
    if(line->expLen) {
        // Separator between the name and its expansion.
        log("/");
        log(line->expansionString);
    }

    if(!nl) {
        return;
    }
    u_fprintf(out, "\n");
    u_fflush(out);
}
void UPrinter::log(const char *fmt, ...)
{
UChar buffer[4000];
va_list ap;
va_start(ap, fmt);
/* sprintf it just to make sure that the information is valid */
u_vsprintf(buffer, _locale, fmt, ap);
va_end(ap);
if( _on ) {
log(buffer);
}
}
// Switches logging on: sets the flag that every log method checks before
// writing.
void UPrinter::on() {
    _on = TRUE;
}
// Switches logging off: clears the flag that every log method checks
// before writing.
void UPrinter::off() {
    _on = FALSE;
}

51
tools/colprobe/uprinter.h Executable file
View file

@ -0,0 +1,51 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File uprinter.h
*
* Modification History:
*
* Date Name Description
* 03/18/2003 weiv Creation.
*******************************************************************************
*/
#ifndef COLPROBE_UPRINTER_H
#define COLPROBE_UPRINTER_H
#include "line.h"
#include "unicode/ustdio.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
// Small logging helper around an ICU UFILE.  Output can be switched on and
// off at run time; the UFILE is closed when the printer is destroyed.
class UPrinter {
    UFILE *out;          // destination; closed in the destructor
    UChar buffer[256];   // NOTE(review): not referenced by uprinter.cpp - confirm before removing
    UBool _on;           // log methods only write while this is TRUE
    char _locale[256];   // locale id handed to the printf-style formatter

    // Not implemented.  The printer owns 'out' and closes it in its
    // destructor, so a copy would close the same UFILE twice.
    UPrinter(const UPrinter &other);
    UPrinter &operator=(const UPrinter &other);
public:
    // Wraps an already open stdio stream.
    UPrinter(FILE *file, const char *locale, const char *encoding, UBool transliterateNonPrintable=TRUE);
    // Opens the file 'name' for writing.  'trans' may be NULL, in which case
    // a default Any-Hex/Java transliterator is used when
    // transliterateNonPrintable is set.
    UPrinter(const char *name, const char *locale, const char *encoding, UTransliterator *trans, UBool transliterateNonPrintable);
    ~UPrinter();

    // Each log overload writes only while the printer is on; nl appends a
    // newline after the text.
    void log(const UnicodeString &string, UBool nl = FALSE);
    void log(const UChar *string, UBool nl = FALSE);
    //void log(const char *string, UBool nl = FALSE);
    void log(const Line *line, UBool nl = FALSE);
    // printf-style variant.
    void log(const char *fmt, ...);

    void off(void);
    void on(void);
    // Reports whether logging is currently switched on.
    UBool isOn(void) const {
        return _on;
    }
};
#endif // #ifndef COLPROBE_UPRINTER_H

View file

@ -0,0 +1,30 @@
#!/usr/bin/perl -w
# Runs colprobe once for every locale listed in locale.txt (one locale per
# line; lines starting with '#' are skipped).  The platform name is taken
# from the first command line argument.  Each run's output goes to
# <platform>logs/<locale>_log.txt.
use strict;

#my $commandPath = "~/src/icu/source/extra/colprobe/";
my $commandPath = "c:/dev/0_icu/source/extra/colprobe/release/";

my $platform = $ARGV[0];
if (!defined $platform || $platform eq '') {
    die "Usage: $0 <platform>\n";
}

# Create the log directory and the platform directory unless they are
# already there from an earlier run.
foreach my $dir ($platform . "logs", $platform) {
    next if -d $dir;
    mkdir $dir or die "Cannot create directory $dir: $!\n";
}

# The locale list comes from a file; `locale -a` would be the live source.
open(my $localeFile, '<', 'locale.txt')
    or die "Cannot open locale.txt: $!\n";
my $localeMinusA = do { local $/; <$localeFile> };
close $localeFile;
$localeMinusA = '' unless defined $localeMinusA;

# Accept both CRLF and LF line endings so the list is split correctly no
# matter where locale.txt was written.
foreach my $locale (split /\r?\n/, $localeMinusA) {
    next if $locale eq '';       # blank line
    next if $locale =~ /^\#/;    # commented-out locale

    my $command = $commandPath."colprobe --platform $platform --ref $platform --output resb $locale >$platform"."logs/$locale"."_log.txt 2>&1";
    print "$command\n";

    # The command relies on shell redirection, so it is run through the
    # shell as one string.  A failure is reported but does not stop the
    # remaining locales.
    system($command) == 0
        or warn "colprobe failed for locale $locale (status $?)\n";
}