ICU-5407 The UTF-32 converter is now supported by the ICU4J charset code.

X-SVN-Rev: 20485
This commit is contained in:
George Rhoten 2006-10-04 21:46:41 +00:00
parent ef584abc21
commit 820cc4cc73
3 changed files with 28 additions and 315 deletions

View file

@ -1,7 +1,7 @@
/*
**************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 2005-2006, International Business Machines Corporation *
* and others. All Rights Reserved. *
**************************************************************************
*
*/
@ -15,7 +15,6 @@ import java.net.URL;
import javax.swing.*;
import com.ibm.icu.impl.UTF32;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@ -96,12 +95,6 @@ public class DetectingViewer extends JFrame implements ActionListener
return new BufferedInputStream(fileStream);
}
/**
 * Opens the file called <code>filename</code> located in <code>directory</code>
 * by resolving the two into a single {@link File} and handing it to
 * <code>openFile(File)</code>.
 *
 * @param directory the parent directory path.
 * @param filename the name of the file within <code>directory</code>.
 */
private void openFile(String directory, String filename)
{
    File target = new File(directory, filename);

    openFile(target);
}
private BufferedInputStream openURL(String url)
{
InputStream s = null;
@ -260,34 +253,14 @@ public class DetectingViewer extends JFrame implements ActionListener
inputStream.reset();
if (encoding.startsWith("UTF-32")) {
byte[] bytes = new byte[1024];
int offset = 0;
int chBytes = 0;
UTF32 utf32 = UTF32.getInstance(encoding);
while ((bytesRead = inputStream.read(bytes, offset, 1024)) >= 0) {
offset = bytesRead % 4;
chBytes = bytesRead - offset;
sb.append(utf32.fromBytes(bytes, 0, chBytes));
if (offset != 0) {
for (int i = 0; i < offset; i += 1) {
bytes[i] = bytes[chBytes + i];
}
}
}
} else {
isr = new InputStreamReader(inputStream, encoding);
while ((bytesRead = isr.read(buffer, 0, 1024)) >= 0) {
sb.append(buffer, 0, bytesRead);
}
isr.close();
isr = new InputStreamReader(inputStream, encoding);
while ((bytesRead = isr.read(buffer, 0, 1024)) >= 0) {
sb.append(buffer, 0, bytesRead);
}
isr.close();
this.setTitle(title + " - " + encodingName(matches[0]));
setMatchMenu(matches);

View file

@ -13,7 +13,6 @@ import java.io.Reader;
import java.io.UnsupportedEncodingException;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UTF32;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@ -141,36 +140,30 @@ public class TestCharsetDetector extends TestFmwk
CharsetDetector det = new CharsetDetector();
byte[] bytes;
if (enc.startsWith("UTF-32")) {
UTF32 utf32 = UTF32.getInstance(enc);
bytes = utf32.toBytes(testString);
} else {
String from = enc;
String from = enc;
while (true) {
try {
bytes = testString.getBytes(from);
} catch (UnsupportedOperationException uoe) {
// In some runtimes, the ISO-2022-CN converter
// only converts *to* Unicode - we have to use
// x-ISO-2022-CN-GB to convert *from* Unicode.
if (from.equals("ISO-2022-CN")) {
from = "x-ISO-2022-CN-GB";
continue;
}
// Ignore any other converters that can't
// convert from Unicode.
return;
} catch (UnsupportedEncodingException uee) {
// Ignore any encodings that this runtime
// doesn't support.
return;
while (true) {
try {
bytes = testString.getBytes(from);
} catch (UnsupportedOperationException uoe) {
// In some runtimes, the ISO-2022-CN converter
// only converts *to* Unicode - we have to use
// x-ISO-2022-CN-GB to convert *from* Unicode.
if (from.equals("ISO-2022-CN")) {
from = "x-ISO-2022-CN-GB";
continue;
}
break;
// Ignore any other converters that can't
// convert from Unicode.
return;
} catch (UnsupportedEncodingException uee) {
// Ignore any encodings that this runtime
// doesn't support.
return;
}
break;
}
det.setText(bytes);

View file

@ -1,253 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.UTF16;
/**
 * This class converts between an array of bytes in UTF-32 encoding (BE or LE) and
 * Java Strings. The byte order is supplied by the two nested subclasses,
 * {@link BE} and {@link LE}, through the <code>pack</code> and <code>unpack</code>
 * methods; instances are obtained from {@link #getBEInstance()},
 * {@link #getLEInstance()} or {@link #getInstance(String)}.
 *
 * @internal
 */
public abstract class UTF32
{
    /**
     * This method packs a 32-bit Unicode code point into the byte array. It is
     * implemented by subclasses that implement the BE and LE encodings.
     *
     * @param bytes the destination byte array
     * @param codePoint the 32-bit Unicode code point
     * @param out the destination index in <code>bytes</code>; four bytes are
     *            written, at <code>out</code> through <code>out + 3</code>.
     *
     * @internal
     */
    abstract protected void pack(byte[] bytes, int codePoint, int out);

    /**
     * This method unpacks bytes from the encoded byte array into a 32-bit
     * Unicode code point. It is implemented by subclasses that implement the
     * BE and LE encodings.
     *
     * @param bytes the source byte array.
     * @param index the index of the first source byte; four bytes are read,
     *              at <code>index</code> through <code>index + 3</code>.
     * @return the 32-bit Unicode code point.
     *
     * @internal
     */
    abstract protected int unpack(byte[] bytes, int index);

    /**
     * Convert a Java String into an array of UTF-32 encoded bytes. Calls
     * the <code>pack</code> method to do the encoding.
     *
     * @param utf16 the source Java String.
     * @return an array of UTF-32 encoded bytes, four bytes per code point.
     *
     * @internal
     */
    public byte[] toBytes(String utf16)
    {
        int codePoints = UTF16.countCodePoint(utf16);
        byte[] bytes = new byte[codePoints * 4];

        // The loop must be bounded by the length in UTF-16 code units, not by
        // the number of code points: "index" is a code-unit offset, and it moves
        // two units for every supplementary character. Bounding it by the code
        // point count stops early and leaves the trailing code points unencoded.
        int length = (utf16 == null) ? 0 : utf16.length();
        int out = 0;

        for (int index = 0; index < length; out += 4) {
            int codePoint = UTF16.charAt(utf16, index);

            pack(bytes, codePoint, out);
            index += UTF16.getCharCount(codePoint);
        }

        return bytes;
    }

    /**
     * This method converts a sequence of UTF-32 encoded bytes into
     * a Java String. It calls the <code>unpack</code> method to implement
     * the encoding.
     * <p>
     * The bytes are consumed four at a time starting at <code>offset</code>, so
     * <code>count</code> is expected to be a multiple of four; otherwise the last
     * <code>unpack</code> call reads up to three bytes beyond
     * <code>offset + count</code>.
     *
     * @param bytes the source byte array.
     * @param offset the starting offset in the byte array.
     * @param count the number of bytes to process.
     * @return the Java String.
     *
     * @internal
     */
    public String fromBytes(byte[] bytes, int offset, int count)
    {
        StringBuffer buffer = new StringBuffer();
        int limit = offset + count;

        for (int index = offset; index < limit; index += 4) {
            int codePoint = unpack(bytes, index);

            UTF16.append(buffer, codePoint);
        }

        return buffer.toString();
    }

    /**
     * A convenience method that converts an entire byte array
     * into a Java String.
     *
     * @param bytes the source byte array.
     * @return the Java String.
     *
     * @internal
     */
    public String fromBytes(byte[] bytes)
    {
        return fromBytes(bytes, 0, bytes.length);
    }

    /**
     * Get an instance that implements the UTF-32BE encoding. The instance is
     * created on first use and then reused. The creation is not synchronized;
     * the instance has no fields of its own, so the worst outcome of two
     * threads racing here is a redundant allocation.
     *
     * @return the instance.
     *
     * @internal
     */
    static public UTF32 getBEInstance()
    {
        if (beInstance == null) {
            beInstance = new BE();
        }

        return beInstance;
    }

    /**
     * Get an instance that implements the UTF-32LE encoding. The instance is
     * created on first use and then reused. The creation is not synchronized;
     * the instance has no fields of its own, so the worst outcome of two
     * threads racing here is a redundant allocation.
     *
     * @return the instance.
     *
     * @internal
     */
    static public UTF32 getLEInstance()
    {
        if (leInstance == null) {
            leInstance = new LE();
        }

        return leInstance;
    }

    /**
     * Get an instance that implements either UTF-32BE or UTF-32LE,
     * depending on the encoding name supplied.
     *
     * @param encoding the encoding name - must be <code>"UTF-32BE"</code> or <code>"UTF-32LE"</code>.
     * @return the instance, or <code>null</code> if <code>encoding</code> is
     *         any other name (including plain <code>"UTF-32"</code>).
     *
     * @internal
     */
    static public UTF32 getInstance(String encoding)
    {
        if (encoding.equals("UTF-32BE")) {
            return getBEInstance();
        }

        if (encoding.equals("UTF-32LE")) {
            return getLEInstance();
        }

        return null;
    }

    /**
     * This subclass implements the UTF-32BE encoding via the
     * <code>pack</code> and <code>unpack</code> methods.
     *
     * @internal
     */
    static class BE extends UTF32
    {
        /**
         * This method packs a 32-bit Unicode code point into the byte array using
         * the UTF-32BE encoding (most significant byte first).
         *
         * @param bytes the destination byte array
         * @param codePoint the 32-bit Unicode code point
         * @param out the destination index in <code>bytes</code>.
         *
         * @internal
         */
        public void pack(byte[] bytes, int codePoint, int out)
        {
            bytes[out + 0] = (byte) ((codePoint >> 24) & 0xFF);
            bytes[out + 1] = (byte) ((codePoint >> 16) & 0xFF);
            bytes[out + 2] = (byte) ((codePoint >>  8) & 0xFF);
            bytes[out + 3] = (byte) (codePoint & 0xFF);
        }

        /**
         * This method unpacks bytes from the UTF-32BE encoded byte array into a 32-bit
         * Unicode code point.
         *
         * @param bytes the source byte array.
         * @param index the index of the first source byte.
         * @return the 32-bit Unicode code point.
         *
         * @internal
         */
        public int unpack(byte[] bytes, int index)
        {
            // Mask each byte to undo sign extension before shifting it into place.
            return (bytes[index + 0] & 0xFF) << 24 | (bytes[index + 1] & 0xFF) << 16 |
                   (bytes[index + 2] & 0xFF) <<  8 | (bytes[index + 3] & 0xFF);
        }
    }

    /**
     * This subclass implements the UTF-32LE encoding via the
     * <code>pack</code> and <code>unpack</code> methods.
     *
     * @internal
     */
    static class LE extends UTF32
    {
        /**
         * This method packs a 32-bit Unicode code point into the byte array using
         * the UTF-32LE encoding (least significant byte first).
         *
         * @param bytes the destination byte array
         * @param codePoint the 32-bit Unicode code point
         * @param out the destination index in <code>bytes</code>.
         *
         * @internal
         */
        public void pack(byte[] bytes, int codePoint, int out)
        {
            bytes[out + 3] = (byte) ((codePoint >> 24) & 0xFF);
            bytes[out + 2] = (byte) ((codePoint >> 16) & 0xFF);
            bytes[out + 1] = (byte) ((codePoint >>  8) & 0xFF);
            bytes[out + 0] = (byte) (codePoint & 0xFF);
        }

        /**
         * This method unpacks bytes from the UTF-32LE encoded byte array into a 32-bit
         * Unicode code point.
         *
         * @param bytes the source byte array.
         * @param index the index of the first source byte.
         * @return the 32-bit Unicode code point.
         *
         * @internal
         */
        public int unpack(byte[] bytes, int index)
        {
            // Mask each byte to undo sign extension before shifting it into place.
            return (bytes[index + 3] & 0xFF) << 24 | (bytes[index + 2] & 0xFF) << 16 |
                   (bytes[index + 1] & 0xFF) <<  8 | (bytes[index + 0] & 0xFF);
        }
    }

    /** Lazily created shared UTF-32BE converter; see <code>getBEInstance</code>. */
    private static UTF32 beInstance = null;

    /** Lazily created shared UTF-32LE converter; see <code>getLEInstance</code>. */
    private static UTF32 leInstance = null;
}