mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-10835 Add tool for generation of regular expressions casing data
X-SVN-Rev: 35682
This commit is contained in:
parent
3ed1418315
commit
8807332753
3 changed files with 172 additions and 0 deletions
12
tools/unicode/c/genregexcasing/Makefile
Normal file
12
tools/unicode/c/genregexcasing/Makefile
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Copyright (C) 2014, International Business Machines
# Corporation and others. All Rights Reserved.

# Edit the following two lines to reflect the location of your ICU sources & build (if out-of-source)
ICU_HOME=$(HOME)/icu/icu/trunk/source
ICU_BUILD=$(ICU_HOME)

# "data" names an action rather than a file, so always run it even if a file
# called "data" happens to exist in this directory.
.PHONY: data

# Default target: build the tool, then run it against the ICU build's
# libraries and data. The generated declarations are written to stdout.
data: genregexcasing
	LD_LIBRARY_PATH=$(ICU_BUILD)/lib:$(ICU_BUILD)/stubdata ICU_DATA=$(ICU_BUILD)/data/out ./genregexcasing

# Compile and link the generator against the ICU headers and libraries.
genregexcasing: genregexcasing.cpp
	clang++ genregexcasing.cpp -std=c++0x -g -I $(ICU_HOME)/common -I $(ICU_HOME)/i18n -I $(ICU_HOME)/io -L$(ICU_BUILD)/lib -L$(ICU_BUILD)/stubdata -licuuc -licui18n -licudata -o genregexcasing
|
140
tools/unicode/c/genregexcasing/genregexcasing.cpp
Normal file
140
tools/unicode/c/genregexcasing/genregexcasing.cpp
Normal file
|
@ -0,0 +1,140 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
// file name: genregexcasing.cpp
|
||||
//
|
||||
// Program to generate the casing data for use by ICU regular expressions.
|
||||
// The data declarations output when running this program are to be copied
|
||||
// into the file i18n/regexcmp.cpp
|
||||
//
|
||||
// See the function RegexCompile::findCaseInsensitiveStarters() for more explanation.
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "iostream"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Convert an ICU UnicodeString to a std::string holding its UTF-8 encoding,
// so that it can be written to a narrow output stream.
std::string sstring(const UnicodeString &us) {
    std::string utf8;
    us.toUTF8String(utf8);   // appends the UTF-8 form of 'us' to 'utf8'
    return utf8;
}
|
||||
|
||||
int main() {
|
||||
|
||||
std::map<UChar32, std::set<UChar32>> cmap;
|
||||
|
||||
for (UChar32 cp = 0; cp<=0x10ffff; cp++) {
|
||||
UnicodeSet s(cp, cp);
|
||||
s.closeOver(USET_CASE_INSENSITIVE);
|
||||
|
||||
UnicodeSetIterator setIter(s);
|
||||
while (setIter.next()) {
|
||||
if (!setIter.isString()) {
|
||||
continue;
|
||||
}
|
||||
const UnicodeString &str = setIter.getString();
|
||||
|
||||
cout << "Got a string for \"" << sstring(UnicodeString(cp)) << "\" [\\u" << hex << cp << "]\n";
|
||||
cout << " \"" << sstring(str) << "\" [";
|
||||
for (int32_t j=0; j<str.length(); j=str.moveIndex32(j, 1)) {
|
||||
cout << hex << "\\u" << str.char32At(j) << " ";
|
||||
}
|
||||
cout << "]" << endl;
|
||||
UChar32 c32 = str.char32At(0);
|
||||
if (s.contains(c32)) {
|
||||
cout << " Set contains first char.\n";
|
||||
}
|
||||
cmap[c32].insert(cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::cout << "Iterating the map.\n";
|
||||
for (const auto &mapPair: cmap) {
|
||||
UChar32 cp = mapPair.first;
|
||||
std::cout << "key: \"" << sstring(UnicodeString(cp)) << "\" \\u" << cp << " : [";
|
||||
for (UChar32 valCP: mapPair.second) {
|
||||
std::cout << "\"" << sstring(UnicodeString(valCP)) << "\" \\u" << valCP << " ";
|
||||
}
|
||||
std::cout << "]\n";
|
||||
}
|
||||
|
||||
//
|
||||
// Create the data arrays to be pasted into regexcmp.cpp
|
||||
//
|
||||
|
||||
std::cout << "\n\nCopy the lines below into the file i18n/regexcmp.cpp.\n\n";
|
||||
std::cout << "// Machine Generated Data. Do not hand edit.\n";
|
||||
|
||||
UnicodeString outString;
|
||||
struct Item {
|
||||
UChar32 fCP = 0;
|
||||
int16_t fStrIndex = 0;
|
||||
int16_t fCount = 0;
|
||||
};
|
||||
|
||||
std::vector<Item> data;
|
||||
for (const auto &mapPair: cmap) {
|
||||
Item dataForCP;
|
||||
dataForCP.fCP = mapPair.first;
|
||||
dataForCP.fStrIndex = outString.length();
|
||||
for (UChar32 valCP: mapPair.second) {
|
||||
outString.append(valCP);
|
||||
dataForCP.fCount++;
|
||||
}
|
||||
data.push_back(dataForCP);
|
||||
}
|
||||
|
||||
std::cout << " static const UChar32 RECaseFixCodePoints[] = {" ;
|
||||
int items=0;
|
||||
for (const Item &d: data) {
|
||||
if (items++ % 10 == 0) {
|
||||
std::cout << "\n ";
|
||||
}
|
||||
std::cout << "0x" << d.fCP << ", ";
|
||||
}
|
||||
std::cout << "0x110000};\n\n";
|
||||
|
||||
std::cout << " static const int16_t RECaseFixStringOffsets[] = {";
|
||||
items = 0;
|
||||
for (const Item &d: data) {
|
||||
if (items++ % 10 == 0) {
|
||||
std::cout << "\n ";
|
||||
}
|
||||
std::cout << "0x" << d.fStrIndex << ", ";
|
||||
}
|
||||
std::cout << "0};\n\n";
|
||||
|
||||
std::cout << " static const int16_t RECaseFixCounts[] = {";
|
||||
items = 0;
|
||||
for (const Item &d: data) {
|
||||
if (items++ % 10 == 0) {
|
||||
std::cout << "\n ";
|
||||
}
|
||||
std::cout << "0x" << d.fCount << ", ";
|
||||
}
|
||||
std::cout << "0};\n\n";
|
||||
|
||||
std::cout << " static const UChar RECaseFixData[] = {";
|
||||
for (int i=0; i<outString.length(); i++) {
|
||||
if (i % 10 == 0) {
|
||||
std::cout << "\n ";
|
||||
}
|
||||
std::cout << "0x" << outString.charAt(i) << ", ";
|
||||
}
|
||||
std::cout << "0};\n\n";
|
||||
return 0;
|
||||
}
|
||||
|
20
tools/unicode/c/genregexcasing/readme.txt
Normal file
20
tools/unicode/c/genregexcasing/readme.txt
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Copyright (C) 2014, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# created on: 2014 May 2
|
||||
# created by: Andy Heninger
|
||||
|
||||
genregexcasing is the tool for generating extended case closure data needed by
|
||||
regular expressions for case insensitive matching.
|
||||
|
||||
The tool generates C++ data declarations that are then manually copied into the file
|
||||
i18n/regexcmp.cpp.
|
||||
|
||||
Edit the Makefile to have the correct directories for your ICU sources and build
|
||||
(the top two lines.)
|
||||
|
||||
A Unix-like system and the clang compiler are assumed.
|
||||
|
||||
To build and run the tool, from within this directory, do a plain, unqualified
|
||||
make
|
||||
|
Loading…
Add table
Reference in a new issue