ICU-9267 ICU4J Charset Detector Crash Fix

X-SVN-Rev: 31724
2025-04-07 14:31:31 +00:00 · 2012-04-18 00:01:23 +00:00 · 2012-04-18 00:01:23 +00:00 · fdea410032
commit fdea410032
parent 834cee36a1
3 changed files with 31 additions and 16 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
-* Copyright (C) 2005-2011, International Business Machines Corporation and    *
+* Copyright (C) 2005-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@ -417,7 +417,7 @@ public class CharsetDetector {
    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                   new byte[kBufSize];  //   removed if appropriate.
    
-    int         fInputLen;          // Length of the byte data in fInputText.
+    int         fInputLen;          // Length of the byte data in fInputBytes.
    
    short       fByteStats[] =      // byte frequency statistics for the input text.
                   new short[256];  //   Value is percent, not absolute.
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
        //arabic shaping class, method shape/unshape
        protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
        protected byte[] prev_fInputBytes = null;
+        protected int prev_fInputLen = 0;

        protected static byte[] byteMap = {
 /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
        protected void matchInit(CharsetDetector det) 
        {
            assert prev_fInputBytes == null;
-            prev_fInputBytes = new byte[det.fInputLen];
-            System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen);
-            byte bb[] = unshape(prev_fInputBytes);
-            System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length);
-            det.fInputLen = bb.length;
+            prev_fInputBytes = det.fInputBytes;
+            prev_fInputLen = det.fInputLen;
+            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
+            det.fInputLen = det.fInputBytes.length;
        }
        
        /*
@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
         * of JDK can produce different results and therefore is also avoided.
         */
-        private byte[] unshape(byte[] inputBytes) {
-            byte resultByteArr[] = unshapeLamAlef(inputBytes);
+        private byte[] unshape(byte[] inputBytes, int inputLen) {
+            byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);
            
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<resultByteArr.length; i++){
                resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
            }
            return resultByteArr;
        }

-        private byte[] unshapeLamAlef(byte[] inputBytes) {
-            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputBytes.length*2);
+        private byte[] unshapeLamAlef(byte[] inputBytes, int inputLen) {
+            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputLen*2);
            ByteBuffer resultBuffer;
            byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};

           
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<inputLen; i++){
                if (isLamAlef(inputBytes[i]))
                    resultBigBuffer.put(unshapedLamAlef);
                else
@ -1229,8 +1229,8 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
        
        protected void matchFinish(CharsetDetector det) {
            if (prev_fInputBytes != null) {
-                System.arraycopy(prev_fInputBytes, 0, det.fInputBytes, 0, prev_fInputBytes.length);
-                det.fInputLen = prev_fInputBytes.length;
+                det.fInputBytes = prev_fInputBytes;
+                det.fInputLen = prev_fInputLen;
                prev_fInputBytes = null;
            }
        }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -1057,5 +1057,20 @@ public class TestCharsetDetector extends TestFmwk
      //
      // End of Bug #8309 Test Case
      //
+
+
+    public void TestBut9267() {
+        // Test a long input of Lam Alef characters for CharsetRecog_IBM420_ar.
+        // Bug 9267 was an array out of bounds problem in the unshaping code for these.
+        byte [] input = new byte [7700]; 
+        int i;
+        for (i=0; i<input.length; i++) {
+          input[i] = (byte)0xb2;
+        }
+        CharsetDetector det = new CharsetDetector();
+        det.setText(input);
+        det.detect();
+    }    
+
      
-  }
+}