diff --git a/icu4j/src/com/ibm/icu/dev/test/TestAll.java b/icu4j/src/com/ibm/icu/dev/test/TestAll.java index 375a597f547..ee50a014d17 100755 --- a/icu4j/src/com/ibm/icu/dev/test/TestAll.java +++ b/icu4j/src/com/ibm/icu/dev/test/TestAll.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/TestAll.java,v $ - * $Date: 2002/04/03 04:31:59 $ - * $Revision: 1.25 $ + * $Date: 2002/06/14 19:08:57 $ + * $Revision: 1.26 $ * ***************************************************************************************** */ @@ -126,6 +126,12 @@ public class TestAll extends TestFmwk { }); } + public void TestUScriptRun() throws Exception { + run( new TestFmwk[] { + new com.ibm.icu.dev.test.lang.TestUScriptRun(), + }); + } + public void TestNumberFormat() throws Exception { run(new TestFmwk[] { new com.ibm.icu.dev.test.format.IntlTestNumberFormat(), diff --git a/icu4j/src/com/ibm/icu/dev/test/lang/TestUScriptRun.java b/icu4j/src/com/ibm/icu/dev/test/lang/TestUScriptRun.java new file mode 100644 index 00000000000..1f963a5f185 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/lang/TestUScriptRun.java @@ -0,0 +1,272 @@
+/**
+*******************************************************************************
+* Copyright (C) 1999-2002, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+
+package com.ibm.icu.dev.test.lang;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.lang.UScriptRun;
+import com.ibm.icu.dev.test.TestFmwk;
+
+/**
+ * Tests for UScriptRun: argument validation in the constructors and in the
+ * reset() overloads, and the script runs reported for a mixed-script string.
+ */
+public class TestUScriptRun extends TestFmwk
+{
+    public TestUScriptRun()
+    {
+        // nothing
+    }
+
+    public static void main(String[] args) throws Exception {
+        new TestUScriptRun().run(args);
+    }
+
+    /**
+     * One expected run: a piece of text and the script code the iterator
+     * should report for it.
+     */
+    private static final class RunTestData
+    {
+        String runText;
+        int    runScript;
+
+        public RunTestData(String theText, int theScriptCode)
+        {
+            runText   = theText;
+            runScript = theScriptCode;
+        }
+    }
+
+    /*
+     * The expected runs, in order. TestRuns() concatenates the runText
+     * values to build the string that is handed to the iterator.
+     */
+    private static final RunTestData[] testData = {
+        new RunTestData("\u0020\u0946\u0939\u093F\u0928\u094D\u0926\u0940\u0020", UScript.DEVANAGARI),
+        new RunTestData("\u0627\u0644\u0639\u0631\u0628\u064A\u0629\u0020", UScript.ARABIC),
+        new RunTestData("\u0420\u0443\u0441\u0441\u043A\u0438\u0439\u0020", UScript.CYRILLIC),
+        new RunTestData("English (", UScript.LATIN),
+        new RunTestData("\u0E44\u0E17\u0E22", UScript.THAI),
+        new RunTestData(") ", UScript.LATIN),
+        new RunTestData("\u6F22\u5B75", UScript.HAN),
+        new RunTestData("\u3068\u3072\u3089\u304C\u306A\u3068", UScript.HIRAGANA),
+        new RunTestData("\u30AB\u30BF\u30AB\u30CA", UScript.KATAKANA),
+        new RunTestData("\uD801\uDC00\uD801\uDC01\uD801\uDC02\uD801\uDC03", UScript.DESERET)
+    };
+
+    /**
+     * Walk the iterator and compare every run it reports with the expected
+     * start offset, limit offset and script code.
+     *
+     * @param scriptRun the iterator to check, positioned before its first run.
+     * @param runStarts start offset of each expected run, followed by one
+     *                  extra entry holding the limit of the last run.
+     * @param testData  the expected runs.
+     */
+    private void CheckScriptRuns(UScriptRun scriptRun, int[] runStarts, RunTestData[] testData)
+    {
+        int run = 0;
+
+        /* iterate over all the runs */
+        while (scriptRun.next()) {
+            /*
+             * A run beyond the expected ones is an error in its own right.
+             * Report it and stop here, before runStarts[] and testData[]
+             * would be indexed out of bounds.
+             */
+            if (run >= testData.length) {
+                errln("Incorrect number of runs: expected " + testData.length + ", got more than " + testData.length);
+                return;
+            }
+
+            int runStart  = scriptRun.getScriptStart();
+            int runLimit  = scriptRun.getScriptLimit();
+            int runScript = scriptRun.getScriptCode();
+
+            if (runStart != runStarts[run]) {
+                errln("Incorrect start offset for run " + run + ": expected " + runStarts[run] + ", got " + runStart);
+            }
+
+            if (runLimit != runStarts[run + 1]) {
+                errln("Incorrect limit offset for run " + run + ": expected " + runStarts[run + 1] + ", got " + runLimit);
+            }
+
+            if (runScript != testData[run].runScript) {
+                errln("Incorrect script for run " + run + ": expected \"" + UScript.getName(testData[run].runScript) + "\", got \"" + UScript.getName(runScript) + "\"");
+            }
+
+            run += 1;
+        }
+
+        /* Complain if we didn't see the number of runs we expected */
+        if (run != testData.length) {
+            errln("Incorrect number of runs: expected " + testData.length + ", got " + run);
+        }
+    }
+
+    /**
+     * Report an error unless new UScriptRun(chars, start, count) throws an
+     * IllegalArgumentException.
+     *
+     * @param chars     the array to pass to the constructor, may be null.
+     * @param charsName how to spell the array in the error message.
+     */
+    private void expectConstructorFailure(char[] chars, String charsName, int start, int count)
+    {
+        try {
+            new UScriptRun(chars, start, count);
+            errln("new UScriptRun(" + charsName + ", " + start + ", " + count + ") did not produce an IllegalArgumentException!");
+        } catch (IllegalArgumentException iae) {
+            // expected: the requested range is not inside the array
+        }
+    }
+
+    /**
+     * Report an error unless scriptRun.reset(start, count) throws an
+     * IllegalArgumentException.
+     */
+    private void expectResetFailure(UScriptRun scriptRun, int start, int count)
+    {
+        try {
+            scriptRun.reset(start, count);
+            errln("scriptRun.reset(" + start + ", " + count + ") did not produce an IllegalArgumentException!");
+        } catch (IllegalArgumentException iae) {
+            // expected: the requested range is not inside the current text
+        }
+    }
+
+    /**
+     * Report an error unless scriptRun.reset(chars, start, count) throws an
+     * IllegalArgumentException.
+     *
+     * @param charsName how to spell the array in the error message.
+     */
+    private void expectResetFailure(UScriptRun scriptRun, char[] chars, String charsName, int start, int count)
+    {
+        try {
+            scriptRun.reset(chars, start, count);
+            errln("scriptRun.reset(" + charsName + ", " + start + ", " + count + ") did not produce an IllegalArgumentException!");
+        } catch (IllegalArgumentException iae) {
+            // expected: the requested range is not inside the array
+        }
+    }
+
+    /**
+     * The three-argument constructor must reject every range that does not
+     * lie inside the array, for a null array as well as a real one.
+     * (The method name is spelled as it was first checked in; the test
+     * framework reports tests by method name, so it is left alone.)
+     */
+    public void TestContstruction()
+    {
+        char[] dummy = {'d', 'u', 'm', 'm', 'y'};
+
+        expectConstructorFailure(null, "null", 0, 100);
+        expectConstructorFailure(null, "null", 100, 0);
+        expectConstructorFailure(null, "null", 0, -100);
+        expectConstructorFailure(null, "null", -100, 0);
+
+        expectConstructorFailure(dummy, "dummy", 0, 6);
+        expectConstructorFailure(dummy, "dummy", 6, 0);
+        expectConstructorFailure(dummy, "dummy", 0, -100);
+        expectConstructorFailure(dummy, "dummy", -100, 0);
+    }
+
+    /**
+     * Both reset() overloads must reject ranges outside the text, first on
+     * an empty iterator and then on one holding a five character array.
+     */
+    public void TestReset()
+    {
+        UScriptRun scriptRun = null;
+        char[] dummy = {'d', 'u', 'm', 'm', 'y'};
+
+        try {
+            scriptRun = new UScriptRun();
+        } catch (IllegalArgumentException iae) {
+            errln("new UScriptRun() produced an IllegalArgumentException!");
+            return; // nothing left to exercise without an iterator
+        }
+
+        /* the iterator is empty, so every non-empty range is out of bounds */
+        expectResetFailure(scriptRun, 0, 100);
+        expectResetFailure(scriptRun, 100, 0);
+        expectResetFailure(scriptRun, 0, -100);
+        expectResetFailure(scriptRun, -100, 0);
+
+        expectResetFailure(scriptRun, dummy, "dummy", 0, 6);
+        expectResetFailure(scriptRun, dummy, "dummy", 6, 0);
+        expectResetFailure(scriptRun, dummy, "dummy", 0, -100);
+        expectResetFailure(scriptRun, dummy, "dummy", -100, 0);
+
+        try {
+            scriptRun.reset(dummy, 0, dummy.length);
+        } catch (IllegalArgumentException iae) {
+            errln("scriptRun.reset(dummy, 0, dummy.length) produced an IllegalArgumentException!");
+        }
+
+        /* the iterator now holds five characters */
+        expectResetFailure(scriptRun, 0, 6);
+        expectResetFailure(scriptRun, 6, 0);
+    }
+
+    /**
+     * Build one string out of all the test data and check the runs reported
+     * by a fresh iterator, by the same iterator after reset(), and by an
+     * empty iterator that is reset onto the string.
+     */
+    public void TestRuns()
+    {
+        int[] runStarts = new int[testData.length + 1];
+        StringBuffer text = new StringBuffer();
+        UScriptRun scriptRun = null;
+
+        /*
+         * Fill in the test string and the runStarts array.
+         */
+        for (int run = 0; run < testData.length; run += 1) {
+            runStarts[run] = text.length();
+            text.append(testData[run].runText);
+        }
+
+        /* The limit of the last run */
+        runStarts[testData.length] = text.length();
+
+        String testString = text.toString();
+
+        try {
+            scriptRun = new UScriptRun(testString.toCharArray());
+            CheckScriptRuns(scriptRun, runStarts, testData);
+        } catch (IllegalArgumentException iae) {
+            errln("new UScriptRun(testString.toCharArray()) produced an IllegalArgumentException!");
+        }
+
+        /* only meaningful if the construction above succeeded */
+        if (scriptRun != null) {
+            try {
+                scriptRun.reset();
+                CheckScriptRuns(scriptRun, runStarts, testData);
+            } catch (IllegalArgumentException iae) {
+                errln("scriptRun.reset() on a valid UScriptRun produced an IllegalArgumentException!");
+            }
+        }
+
+        try {
+            scriptRun = new UScriptRun();
+
+            if (scriptRun.next()) {
+                errln("scriptRun.next() on an empty UScriptRun returned true!");
+            }
+        } catch (IllegalArgumentException iae) {
+            errln("new UScriptRun() produced an IllegalArgumentException!");
+        }
+
+        if (scriptRun != null) {
+            try {
+                scriptRun.reset(testString.toCharArray(), 0, testString.length());
+                CheckScriptRuns(scriptRun, runStarts, testData);
+            } catch (IllegalArgumentException iae) {
+                errln("scriptRun.reset(testString.toCharArray(), 0, testString.length) produced an IllegalArgumentException!");
+            }
+        }
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/lang/UScriptRun.java b/icu4j/src/com/ibm/icu/lang/UScriptRun.java new file mode 100644
index 00000000000..d5b2d16d1b4 --- /dev/null +++ b/icu4j/src/com/ibm/icu/lang/UScriptRun.java @@ -0,0 +1,403 @@ +/* + ******************************************************************************* + * + * Copyright (C) 1999-2002, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* + */ + +package com.ibm.icu.lang; + +/** + * UScriptRun is used to find runs of characters in + * the same script. It implements a simple iterator over an array + * of characters. The iterator will resolve script-neutral characters + * like punctuation into the script of the surrounding characters. + * + * The iterator will try to match paired punctuation. If it sees an + * opening punctuation character, it will remember the script that + * was assigned to that character, and assign the same script to the + * matching closing punctuation. + * + * Scripts are chosen based on the UScript class. + * No attempt is made to combine related scripts into a single run. In + * particular, Hiragana, Katakana, and Han characters will appear in separate + * runs. + + * Here is an example of how to iterate over script runs: + * 
+ * void printScriptRuns(char []text)
+ * {
+ *     UScriptRun scriptRun = new UScriptRun(text);
+ *
+ *     while (scriptRun.next()) {
+ *         int start  = scriptRun.getScriptStart();
+ *         int limit  = scriptRun.getScriptLimit();
+ *         int script = scriptRun.getScriptCode();
+ *
+ *         System.out.println("Script \"" + UScript.getName(script) + "\" from " +
+ *                            start + " to " + limit + ".");
+ *     }
+ *  }
+ * 
+ */ +public final class UScriptRun +{ + /** + * Puts a copyright in the .class file + */ + private static final String copyrightNotice + = "Copyright \u00a91999-2002 IBM Corp. All rights reserved."; + + /** + * Construct an empty UScriptRun object. The next() + * method will return false the first time it is called. + */ + public UScriptRun() + { + reset(null, 0, 0); + } + + /** + * Construct a UScriptRun object which iterates over the given + * characetrs. + * + * @param chars the array of characters over which to iterate. + */ + public UScriptRun(char[] chars) + { + reset(chars, 0, chars.length); + } + + /** + * Construct a UScriptRun object which iterates over a subrange + * of the given characetrs. + * + * @param chars the array of characters over which to iterate. + * @param start the index of the first character over which to iterate + * @param count the number of characters over which to iterate + */ + public UScriptRun(char[] chars, int start, int count) + { + reset(chars, start, count); + } + + + /** + * Reset the iterator to the start of the text. + */ + public final void reset() { + scriptStart = charStart; + scriptLimit = charStart; + scriptCode = UScript.COMMON; + } + + /** + * Reset the iterator to iterate over the given range of the text. Throws + * IllegalArgumentException if the range is outside of the bounds of the + * character array. + * + * @param start the index of the new first character over which to iterate + * @param count the new number of characters over which to iterate. + * @exception IllegalArgumentException + */ + public final void reset(int start, int count) + throws IllegalArgumentException + { + int len = 0; + + if (charArray != null) { + len = charArray.length; + } + + if (start < 0 || count < 0 || start > len - count) { + throw new IllegalArgumentException(); + } + + charStart = start; + charLimit = start + count; + + reset(); + } + + /** + * Reset the iterator to iterate over count characters + * in chars starting at start. 
This allows + * clients to reuse an iterator. + * + * @param chars the new array of characters over which to iterate. + * @param start the index of the first character over which to iterate. + * @param count the nuber of characters over which to iterate. + */ + public final void reset(char[] chars, int start, int count) + { + charArray = chars; + + reset(start, count); + } + + + /** + * Get the starting index of the current script run. + * + * @returns the index of the first character in the current script run. + */ + public final int getScriptStart() + { + return scriptStart; + } + + /** + * Get the index of the first character after the current script run. + * + * @return the index of the first character after the current script run. + */ + public final int getScriptLimit() + { + return scriptLimit; + } + + /** + * Get the script code for the script of the current script run. + * + * @return the script code for the script of the current script run. + * @see com.ibm.icu.lang.UScript + */ + public final int getScriptCode() + { + return scriptCode; + } + + /** + * Find the next script run. Returns false if there + * isn't another run, returns true if there is. + * + * @return false if there isn't another run, true if there is. 
+ */ + public final boolean next() + { + int startSP = parenSP; // used to find the first new open character + + // if we've fallen off the end of the text, we're done + if (scriptLimit >= charLimit) { + return false; + } + + scriptCode = UScript.COMMON; + + for (scriptStart = scriptLimit; scriptLimit < charLimit; scriptLimit += 1) { + int high = charArray[scriptLimit]; + int ch = high; + + // if the character is a high surrogate and it's not the last one + // in the text, see if it's followed by a low surrogate + if (high >= 0xD800 && high <= 0xDBFF && scriptLimit < charLimit - 1) + { + int low = charArray[scriptLimit + 1]; + + // if it is followed by a low surrogate, + // consume it and form the full character + if (low >= 0xDC00 && low <= 0xDFFF) { + ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; + scriptLimit += 1; + } + } + + int sc = UScript.getScript(ch); + int pairIndex = getPairIndex(ch); + + // Paired character handling: + // + // if it's an open character, push it onto the stack. + // if it's a close character, find the matching open on the + // stack, and use that script code. Any non-matching open + // characters above it on the stack will be poped. + if (pairIndex >= 0) { + if ((pairIndex & 1) == 0) { + parenStack[++parenSP] = new ParenStackEntry(pairIndex, scriptCode); + } else if (parenSP >= 0) { + int pi = pairIndex & ~1; + + while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { + parenSP -= 1; + } + + if (parenSP < startSP) { + startSP = parenSP; + } + + if (parenSP >= 0) { + sc = parenStack[parenSP].scriptCode; + } + } + } + + if (sameScript(scriptCode, sc)) { + if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { + scriptCode = sc; + + // now that we have a final script code, fix any open + // characters we pushed before we knew the script code. 
+ while (startSP < parenSP) { + parenStack[++startSP].scriptCode = scriptCode; + } + } + + // if this character is a close paired character, + // pop it from the stack + if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { + parenSP -= 1; + startSP -= 1; + } + } else { + // if the run broke on a surrogate pair, + // end it before the high surrogate + if (ch >= 0x10000) { + scriptLimit -= 1; + } + + break; + } + } + + return true; + } + + /** + * Compare two script codes to see if they are in the same script. If one script is + * a strong script, and the other is INHERITED or COMMON, it will compare equal. + * + * @param scriptOne one of the script codes. + * @param scriptTwo the other script code. + * @return true if the two scripts are the same. + * @see com.ibm.icu.lang.UScript + */ + private static boolean sameScript(int scriptOne, int scriptTwo) + { + return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; + } + + /* + * An internal class which holds entries on the paren stack. + */ + private static final class ParenStackEntry + { + int pairIndex; + int scriptCode; + + public ParenStackEntry(int thePairIndex, int theScriptCode) + { + pairIndex = thePairIndex; + scriptCode = theScriptCode; + } + }; + + private int charStart; + private int charLimit; + private char charArray[]; + + private int scriptStart; + private int scriptLimit; + private int scriptCode; + + private static ParenStackEntry parenStack[] = new ParenStackEntry[128]; + private int parenSP; + + /** + * Find the highest bit that's set in a word. Uses a binary search through + * the bits. + * + * @param n the word in which to find the highest bit that's set. + * @return the bit number (counting from the low order bit) of the highest bit. 
+ */ + private static final byte highBit(int n) + { + if (n <= 0) { + return -32; + } + + byte bit = 0; + + if (n >= 1 << 16) { + n >>= 16; + bit += 16; + } + + if (n >= 1 << 8) { + n >>= 8; + bit += 8; + } + + if (n >= 1 << 4) { + n >>= 4; + bit += 4; + } + + if (n >= 1 << 2) { + n >>= 2; + bit += 2; + } + + if (n >= 1 << 1) { + n >>= 1; + bit += 1; + } + + return bit; + } + + /** + * Search the pairedChars array for the given character. + * + * @param ch the character for which to search. + * @return the index of the character in the table, or -1 if it's not there. + */ + private static int getPairIndex(int ch) + { + int probe = pairedCharPower; + int index = 0; + + if (ch >= pairedChars[pairedCharExtra]) { + index = pairedCharExtra; + } + + while (probe > (1 << 0)) { + probe >>= 1; + + if (ch >= pairedChars[index + probe]) { + index += probe; + } + } + + if (pairedChars[index] != ch) { + index = -1; + } + + return index; + } + + private static int pairedChars[] = { + 0x0028, 0x0029, // ascii paired punctuation + 0x003c, 0x003e, + 0x005b, 0x005d, + 0x007b, 0x007d, + 0x00ab, 0x00bb, // guillemets + 0x2018, 0x2019, // general punctuation + 0x201c, 0x201d, + 0x2039, 0x203a, + 0x3008, 0x3009, // chinese paired punctuation + 0x300a, 0x300b, + 0x300c, 0x300d, + 0x300e, 0x300f, + 0x3010, 0x3011, + 0x3014, 0x3015, + 0x3016, 0x3017, + 0x3018, 0x3019, + 0x301a, 0x301b + }; + + private static int pairedCharPower = 1 << highBit(pairedChars.length); + private static int pairedCharExtra = pairedChars.length - pairedCharPower; +} +