ICU-9267 ICU4J Charset Detector Crash Fix

X-SVN-Rev: 31724
This commit is contained in:
Andy Heninger 2012-04-18 00:01:23 +00:00
parent 834cee36a1
commit fdea410032
3 changed files with 31 additions and 16 deletions

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2005-2011, International Business Machines Corporation and *
* Copyright (C) 2005-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -417,7 +417,7 @@ public class CharsetDetector {
byte[] fInputBytes = // The text to be checked. Markup will have been
new byte[kBufSize]; // removed if appropriate.
int fInputLen; // Length of the byte data in fInputText.
int fInputLen; // Length of the byte data in fInputBytes.
short fByteStats[] = // byte frequency statistics for the input text.
new short[256]; // Value is percent, not absolute.

View file

@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
//arabic shaping class, method shape/unshape
protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
protected byte[] prev_fInputBytes = null;
protected int prev_fInputLen = 0;
protected static byte[] byteMap = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
protected void matchInit(CharsetDetector det)
{
assert prev_fInputBytes == null;
prev_fInputBytes = new byte[det.fInputLen];
System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen);
byte bb[] = unshape(prev_fInputBytes);
System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length);
det.fInputLen = bb.length;
prev_fInputBytes = det.fInputBytes;
prev_fInputLen = det.fInputLen;
det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
det.fInputLen = det.fInputBytes.length;
}
/*
@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
* on CharsetICU which we try to avoid. IBM420 converter amongst different versions
* of JDK can produce different results and therefore is also avoided.
*/
private byte[] unshape(byte[] inputBytes) {
byte resultByteArr[] = unshapeLamAlef(inputBytes);
private byte[] unshape(byte[] inputBytes, int inputLen) {
byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);
for (int i=0; i<inputBytes.length; i++){
for (int i=0; i<resultByteArr.length; i++){
resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
}
return resultByteArr;
}
private byte[] unshapeLamAlef(byte[] inputBytes) {
ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputBytes.length*2);
private byte[] unshapeLamAlef(byte[] inputBytes, int inputLen) {
ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputLen*2);
ByteBuffer resultBuffer;
byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};
for (int i=0; i<inputBytes.length; i++){
for (int i=0; i<inputLen; i++){
if (isLamAlef(inputBytes[i]))
resultBigBuffer.put(unshapedLamAlef);
else
@ -1229,8 +1229,8 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
protected void matchFinish(CharsetDetector det) {
if (prev_fInputBytes != null) {
System.arraycopy(prev_fInputBytes, 0, det.fInputBytes, 0, prev_fInputBytes.length);
det.fInputLen = prev_fInputBytes.length;
det.fInputBytes = prev_fInputBytes;
det.fInputLen = prev_fInputLen;
prev_fInputBytes = null;
}
}

View file

@ -1057,5 +1057,20 @@ public class TestCharsetDetector extends TestFmwk
//
// End of Bug #8309 Test Case
//
/**
 * Regression test for bug 9267: runs charset detection over a long,
 * uniform byte sequence and passes as long as detection completes
 * without throwing.
 *
 * Per the bug description, the failure was an array-out-of-bounds error in
 * the Lam Alef unshaping code when given a long input of such characters.
 * NOTE(review): the original comment names CharsetRecog_IBM424_he as the
 * recognizer under test - confirm that is the one that performs unshaping.
 */
public void TestBut9267() {
    // 7700 bytes, every one set to 0xb2 (the Lam Alef input for this bug).
    byte[] input = new byte[7700];
    java.util.Arrays.fill(input, (byte) 0xb2);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    // The match result is deliberately ignored; only crash-freedom is checked.
    detector.detect();
}
}
}