ICU-7151 ICU4J RBBI Line break tests updated for Unicode 5.2

X-SVN-Rev: 26639
This commit is contained in:
Andy Heninger 2009-09-16 21:56:30 +00:00
parent 1cc5cd1a8b
commit 5e28363a15
4 changed files with 85 additions and 25 deletions

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91a5fa298946e4adc34f127284de8aef86b89a4b4353fc27dcd02303ca61b20d
size 6408595
oid sha256:569abd84cb1c912c72e1e86d6d82a5b42c3ba3037445495bb0c56738413c6992
size 6408913

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5321a103d09e461c153c530e0bf3ab9ea456a51c68bb73109731ad87b7c26fbc
oid sha256:eb381ea2266664ace19cac6ca554bbe6ff7623d2d9bb1bd9e30707990f992c38
size 717057

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2008 International Business Machines Corporation and *
* Copyright (C) 2003-2009 International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -470,6 +470,8 @@ public class RBBITestMonkey extends TestFmwk {
}
// TODO: for class fCP, fCL: when Unicode 5.2 properties become available, change the definitions of
// these classes to use them.
static class RBBILineMonkey extends RBBIMonkeyKind {
List fSets;
@ -490,6 +492,7 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fBB;
UnicodeSet fHY;
UnicodeSet fCL;
UnicodeSet fCP;
UnicodeSet fEX;
UnicodeSet fIN;
UnicodeSet fNS;
@ -535,7 +538,9 @@ public class RBBITestMonkey extends TestFmwk {
fBA = new UnicodeSet("[\\p{Line_break=BA}]");
fBB = new UnicodeSet("[\\p{Line_break=BB}]");
fHY = new UnicodeSet("[\\p{Line_break=HY}]");
fCL = new UnicodeSet("[\\p{Line_break=CL}]");
fCL = new UnicodeSet("[[\\p{Line_break=CL}]-[\\u0029\\u005d]]");
fCP = new UnicodeSet("[\\u0029\\u005d]");
// fCP = new UnicodeSet("[\\p{Line_break=CP}]");
fEX = new UnicodeSet("[\\p{Line_break=EX}]");
fIN = new UnicodeSet("[\\p{Line_break=IN}]");
fNS = new UnicodeSet("[\\p{Line_break=NS}]");
@ -583,6 +588,7 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fH2);
fSets.add(fH3);
fSets.add(fCL);
fSets.add(fCP);
fSets.add(fEX);
fSets.add(fIN);
fSets.add(fJL);
@ -763,13 +769,14 @@ public class RBBITestMonkey extends TestFmwk {
// LB 13 Don't break before closings.
// NU x CL and NU x IS are not matched here so that they will
// NU x CL, NU x CP and NU x IS are not matched here so that they will
// fall into LB 17 and the more general number regular expression.
//
if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
fEX.contains(thisChar) ||
!fNU.contains(prevChar) && fIS.contains(thisChar) ||
!fNU.contains(prevChar) && fSY.contains(thisChar)) {
!fNU.contains(prevChar) && fCP.contains(thisChar) ||
fEX.contains(thisChar) ||
!fNU.contains(prevChar) && fIS.contains(thisChar) ||
!fNU.contains(prevChar) && fSY.contains(thisChar)) {
continue;
}
@ -806,7 +813,7 @@ public class RBBITestMonkey extends TestFmwk {
}
}
// LB 16 CL SP* x NS
// LB 16 (CL | CP) SP* x NS
if (fNS.contains(thisChar)) {
tPos = prevPos;
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
@ -815,7 +822,7 @@ public class RBBITestMonkey extends TestFmwk {
while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
tPos = moveIndex32(fText, tPos, -1);
}
if (fCL.contains(UTF16.charAt(fText, tPos))) {
if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
continue;
}
}
@ -960,7 +967,16 @@ public class RBBITestMonkey extends TestFmwk {
continue;
}
// LB 30 (Withdrawn as of Unicode 5.1)
// LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
// (AL | NU) x OP
// CP x (AL | NU)
if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
continue;
}
if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
continue;
}
// LB 31 Break everywhere else
break;
@ -972,8 +988,8 @@ public class RBBITestMonkey extends TestFmwk {
// Match the following regular expression in the input text.
// ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)?
// 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
// ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
// 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
// retVals array [0] index of the start of the match, or -1 if no match
// [1] index of first char following the match.
// Can not use Java regex because need supplementary character support,
@ -1065,6 +1081,10 @@ public class RBBITestMonkey extends TestFmwk {
matchState = 9;
break;
}
//if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
// matchState = 9;
// break;
//}
if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
matchState = 11;
break;

View file

@ -1,4 +1,4 @@
# Copyright (c) 2001-2008 International Business Machines
# Copyright (c) 2001-2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -24,13 +24,8 @@
# Temp debugging tests
<sent>
<data>• (This is it). •Testing the sentence iterator. •\
"This isn't it." •Hi! \
•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\
"This isn't it." •\
Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
•Not on my time (el timo.)! •</data>
<line>
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
########################################################################################
#
@ -95,6 +90,28 @@ Hi! •This is a simple sample sentence. •It does not have to make any sense a
# Treat Japanese Half Width voicing marks as combining
<data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>
########################################################################################
#
#
# E x t e n d e d G r a p h e m e C l u s t e r T e s t s
#
#
##########################################################################################
#<xgc>
# Plain Vanilla grapheme clusters
#<data>•a•b•c•</data>
#<data>•a\u0301\u0302• •b\u0303\u0304•</data>
# Assorted Hindi combining marks
#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data>
# Thai Clusters
# $Prepend $Extend* $PrependBase $Extend*;
#
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
########################################################################################
#
#
@ -486,15 +503,28 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
# Regression for bug 836
# Note: Unicode 5.1 changed this behavior
# ICU will want to change it back before releasing,
# so there is no break preceding the '('
<data>•AAA•(AAA •</data>
# Unicode 5.2 changed it again; there is no break preceding the '('
<data>•AAA(AAA •</data>
# Try some words from other scripts.
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
#
<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
#
# ticket #4853: unpaired surrogates should behave like AL
#
<data>•abc\ud801xyz•</data>
#
# Regression tests for failures that originally came from the monkey test.
# Monkey test failure lines can, with slight reformatting, be copied into this section
# as test cases. The error display from here is more informative.
#
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
########################################################################################
#
@ -539,4 +569,14 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
#
<data>•สวัสดี<000>ครับ<000>สบาย<000>ดี<000>ไหม<000> •ครับ<000></data>
#
# Trac ticket 5595 Test Case
# (Omitted from ICU4J because of old dictionary implementation.)
#
# <data>•บท<000>ที่๑พายุ<000>ไซโคลน<000>โด<000>โรธี<000>อาศัย<000>อยู่<000>ท่ามกลาง<000>\
# ทุ่งใหญ่<000>ใน<000>แคนซัส<000>กับ<000>ลุง<000>เฮ<000>นรี<000>ชาวไร่<000>และ<000>ป้า<000>เอ็ม<000>\
# ภรรยา<000>ชาวไร่<000>บ้าน<000>ของ<000>พวก<000>เขา<000>หลัง<000>เล็ก<000>เพราะ<000>ไม้<000>\
# สร้าง<000>บ้าน<000>ต้อง<000>ขน<000>มา<000>ด้วย<000>เกวียน<000>เป็น<000>ระยะ<000>ทาง<000>หลาย<000>\
# ไมล์<000></data>