mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 14:31:31 +00:00
ICU-9267 ICU4J Charset Detector Crash Fix
X-SVN-Rev: 31724
This commit is contained in:
parent
834cee36a1
commit
fdea410032
3 changed files with 31 additions and 16 deletions
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005-2011, International Business Machines Corporation and *
|
||||
* Copyright (C) 2005-2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -417,7 +417,7 @@ public class CharsetDetector {
|
|||
byte[] fInputBytes = // The text to be checked. Markup will have been
|
||||
new byte[kBufSize]; // removed if appropriate.
|
||||
|
||||
int fInputLen; // Length of the byte data in fInputText.
|
||||
int fInputLen; // Length of the byte data in fInputBytes.
|
||||
|
||||
short fByteStats[] = // byte frequency statistics for the input text.
|
||||
new short[256]; // Value is percent, not absolute.
|
||||
|
|
|
@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||
//arabic shaping class, method shape/unshape
|
||||
protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
|
||||
protected byte[] prev_fInputBytes = null;
|
||||
protected int prev_fInputLen = 0;
|
||||
|
||||
protected static byte[] byteMap = {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
|
@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||
protected void matchInit(CharsetDetector det)
|
||||
{
|
||||
assert prev_fInputBytes == null;
|
||||
prev_fInputBytes = new byte[det.fInputLen];
|
||||
System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen);
|
||||
byte bb[] = unshape(prev_fInputBytes);
|
||||
System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length);
|
||||
det.fInputLen = bb.length;
|
||||
prev_fInputBytes = det.fInputBytes;
|
||||
prev_fInputLen = det.fInputLen;
|
||||
det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
|
||||
det.fInputLen = det.fInputBytes.length;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||
* on CharsetICU which we try to avoid. IBM420 converter amongst different versions
|
||||
* of JDK can produce different results and therefore is also avoided.
|
||||
*/
|
||||
private byte[] unshape(byte[] inputBytes) {
|
||||
byte resultByteArr[] = unshapeLamAlef(inputBytes);
|
||||
private byte[] unshape(byte[] inputBytes, int inputLen) {
|
||||
byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);
|
||||
|
||||
for (int i=0; i<inputBytes.length; i++){
|
||||
for (int i=0; i<resultByteArr.length; i++){
|
||||
resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
|
||||
}
|
||||
return resultByteArr;
|
||||
}
|
||||
|
||||
private byte[] unshapeLamAlef(byte[] inputBytes) {
|
||||
ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputBytes.length*2);
|
||||
private byte[] unshapeLamAlef(byte[] inputBytes, int inputLen) {
|
||||
ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputLen*2);
|
||||
ByteBuffer resultBuffer;
|
||||
byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};
|
||||
|
||||
|
||||
for (int i=0; i<inputBytes.length; i++){
|
||||
for (int i=0; i<inputLen; i++){
|
||||
if (isLamAlef(inputBytes[i]))
|
||||
resultBigBuffer.put(unshapedLamAlef);
|
||||
else
|
||||
|
@ -1229,8 +1229,8 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||
|
||||
protected void matchFinish(CharsetDetector det) {
|
||||
if (prev_fInputBytes != null) {
|
||||
System.arraycopy(prev_fInputBytes, 0, det.fInputBytes, 0, prev_fInputBytes.length);
|
||||
det.fInputLen = prev_fInputBytes.length;
|
||||
det.fInputBytes = prev_fInputBytes;
|
||||
det.fInputLen = prev_fInputLen;
|
||||
prev_fInputBytes = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1057,5 +1057,20 @@ public class TestCharsetDetector extends TestFmwk
|
|||
//
|
||||
// End of Bug #8309 Test Case
|
||||
//
|
||||
|
||||
|
||||
public void TestBut9267() {
|
||||
// Test a long input of Lam Alef characters for the Arabic IBM420 recognizer, whose matchInit unshapes the input.
|
||||
// Bug 9267 was an array out of bounds problem in the unshaping code for these.
|
||||
byte [] input = new byte [7700];
|
||||
int i;
|
||||
for (i=0; i<input.length; i++) {
|
||||
input[i] = (byte)0xb2;
|
||||
}
|
||||
CharsetDetector det = new CharsetDetector();
|
||||
det.setText(input);
|
||||
det.detect();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue