ICU-7264 add ScriptExtensions.txt, new scripts, new blocks, fix genpname/preparse.pl

X-SVN-Rev: 28359
This commit is contained in:
Markus Scherer 2010-07-22 23:30:47 +00:00
parent 6f17ff12b4
commit e72d90de1a
8 changed files with 2340 additions and 2235 deletions

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -165,6 +165,7 @@ enum {
UNI_5_0,
UNI_5_1,
UNI_5_2,
UNI_6_0,
UNI_VER_COUNT
};
@ -181,7 +182,8 @@ unicodeVersions[]={
{ 4, 1, 0, 0 },
{ 5, 0, 0, 0 },
{ 5, 1, 0, 0 },
{ 5, 2, 0, 0 }
{ 5, 2, 0, 0 },
{ 6, 0, 0, 0 }
};
static int32_t ucdVersion=UNI_5_2;
@ -1220,6 +1222,11 @@ generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
0, 5,
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
};
static AlgorithmicRange cjkExtD={
0x2b740, 0x2b81d,
0, 5,
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
};
static char jamo[]=
"HANGUL SYLLABLE \0"
@ -1266,6 +1273,9 @@ generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
/* number of ranges of algorithmic names */
if(!storeOptions->storeNames) {
countAlgRanges=0;
} else if(ucdVersion>=UNI_6_0) {
/* Unicode 6.0 and up has 6 ranges including CJK Extension D */
countAlgRanges=6;
} else if(ucdVersion>=UNI_5_2) {
/* Unicode 5.2 and up has 5 ranges including CJK Extension C */
countAlgRanges=5;
@ -1358,6 +1368,19 @@ generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
}
}
/* range 5: cjk extension d */
if(countAlgRanges>=6) {
if(pData!=NULL) {
udata_writeBlock(pData, &cjkExtD, sizeof(AlgorithmicRange));
udata_writeString(pData, prefix, PREFIX_LENGTH);
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
}
} else {
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
}
}
return size;
}

View file

@ -1,5 +1,5 @@
######################################################################
# Copyright (c) 2003-2005, International Business Machines
# Copyright (c) 2003-2010, International Business Machines
# Corporation and others. All Rights Reserved.
######################################################################
# Author: Alan Liu
@ -14,11 +14,12 @@
# ================================================
# ================================================
# Non-enumerated Properties
# Miscellaneous Properties
# ================================================
scx; Script_Extensions
# ================================================
# Enumerated Non-Binary Properties
# Enumerated Properties
# ================================================
# lccc(c)=ccc(NFD(c)[0])

View file

@ -1,5 +1,5 @@
########################################################################
# Copyright (c) 2006-2009, International Business Machines
# Copyright (c) 2006-2010, International Business Machines
# Corporation and others. All Rights Reserved.
########################################################################
# file name: SyntheticPropertyValueAliases.txt
@ -24,9 +24,7 @@
# Script (sc)
sc ; Batk ; Batk
sc ; Blis ; Blis
sc ; Brah ; Brah
sc ; Cirt ; Cirt
sc ; Cyrs ; Cyrs
sc ; Egyd ; Egyd
@ -41,7 +39,6 @@ sc ; Jpan ; Jpan
sc ; Latf ; Latf
sc ; Latg ; Latg
sc ; Lina ; Lina
sc ; Mand ; Mand
sc ; Maya ; Maya
sc ; Mero ; Mero
sc ; Moon ; Moon
@ -66,3 +63,17 @@ sc ; Zmth ; Zmth
sc ; Zsym ; Zsym
sc ; Nkgb ; Nkgb
sc ; Bass ; Bass
sc ; Dupl ; Dupl
sc ; Elba ; Elba
sc ; Gran ; Gran
sc ; Kpel ; Kpel
sc ; Loma ; Loma
sc ; Mend ; Mend
sc ; Merc ; Merc
sc ; Narb ; Narb
sc ; Nbat ; Nbat
sc ; Palm ; Palm
sc ; Sind ; Sind
sc ; Wara ; Wara

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
#!/bin/perl -w
#*******************************************************************
# COPYRIGHT:
# Copyright (c) 2002-2009, International Business Machines Corporation and
# Copyright (c) 2002-2010, International Business Machines Corporation and
# others. All Rights Reserved.
#*******************************************************************
@ -14,17 +14,6 @@
#
# See usage note below.
#
# TODO: The Property[Value]Alias.txt files state that they can support
# more than 2 names per property|value. Currently (Unicode 3.2) there
# are always 1 or 2 names. If more names were supported, presumably
# the format would be something like:
# nv ; Numeric_Value
# nv ; Value_Numerique
# CURRENTLY, this script assumes that there are 1 or two names. Any
# duplicates it sees are flagged as an error. If multiple aliases
# appear in a future version of Unicode, modify this script to support
# that.
#
# NOTE: As of ICU 2.6, this script has been modified to know about the
# pseudo-property gcm/General_Category_Mask, which corresponds to the
# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
@ -70,23 +59,17 @@ my $propNA = 0;
my $valueNA = 0;
#----------------------------------------------------------------------
# Top level property keys for binary, enumerated, string, and double props
my @TOP = qw( _bp _ep _sp _dp _mp );
# Top level property keys for binary, enumerated, string, double, and other props
my @TOP = qw( _bp _ep _sp _dp _op );
# This hash governs how top level properties are grouped into output arrays.
#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ],
# "NO_VALUE" => [ '_sp', '_dp' ] );m
#my %TOP_PROPS = ( "BINARY" => [ '_bp' ],
# "ENUMERATED" => [ '_ep' ],
# "STRING" => [ '_sp' ],
# "DOUBLE" => [ '_dp' ] );
my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
# Top level properties are grouped into output arrays.
my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_op' ] );
my %PROP_TYPE = (Binary => "_bp",
String => "_sp",
Double => "_dp",
Enumerated => "_ep",
Bitmask => "_mp");
Other => "_op");
#----------------------------------------------------------------------
# Properties that are unsupported in ICU
@ -1079,7 +1062,7 @@ sub read_uscript {
# @param a filename for uchar.h
#
# @return a ref to a hash. The keys of the hash are '_bp' for binary
# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_op' for
# double/string/other properties, and 'gc', 'gcm', 'bc', 'blk',
# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
# value aliases. The values of the hash are subhashes. The subhashes
@ -1137,9 +1120,13 @@ sub read_uchar {
elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
die "Error: Unmatched tag $submode" if ($submode);
die "Error: Unrecognized UProperty comment: $_"
unless (exists $PROP_TYPE{$1});
$key = $PROP_TYPE{$1};
#die "Error: Unrecognized UProperty comment: $_"
# unless (exists $PROP_TYPE{$1});
if (exists $PROP_TYPE{$1}) {
$key = $PROP_TYPE{$1};
} else {
$key = $PROP_TYPE{"Other"};
}
$submode = $2;
}
}

View file

@ -1,5 +1,5 @@
#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Copyright (c) 2009-2010 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: ucdcopy.py
@ -120,6 +120,7 @@ _unidata_files = {
"NormalizationCorrections.txt": shutil.copy,
"PropertyAliases.txt": shutil.copy,
"PropertyValueAliases.txt": shutil.copy,
"ScriptExtensions.txt": shutil.copy,
"SpecialCasing.txt": shutil.copy,
"UnicodeData.txt": shutil.copy,

View file

@ -15,6 +15,7 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "unicode/utypes.h"
#include "unicode/errorcode.h"

View file

@ -19,9 +19,8 @@ There are autoconf makefiles (Makefile.in) and Visual C++ project files (.vcproj
in the subfolders. They are copied over from the ICU source tree and will not
work without modifications. However, I started to use CMake (CMakeLists.txt)
which is much simpler, and if it works well enough then I plan to just
delete the old makefiles and project files. The CMake files will currently
work only on Linux, just because I hardcoded the ICU library filenames
(e.g., libicuuc.so).
delete the old makefiles and project files. The CMake files should
work on Linux and Mac OS X.
I should use more variables to make the CMake files more portable, and should
use ICU's installed icu-config or Makefile.inc to get the values for these
variables.