diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/src/com/ibm/icu/text/CharsetDetector.java new file mode 100644 index 00000000000..31e54ef8f9e --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java @@ -0,0 +1,189 @@ +/** +******************************************************************************* +* Copyright (C) 2005, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.io.InputStream; +import java.io.Reader; + + +/** + * + * CharsetDetector provides a facility for detecting the + * charset or encoding of character data in an unknown format. + * The input data can either be from an input stream or an array of bytes. + * The result of the detection operation is a list of possibly matching + * charsets, or, for simple use, you can just ask for a Java Reader that + * will work over the input data. + *

+ * Character set detection is at best an imprecise operation. The detection + * process will attempt to identify the charset that best matches the characteristics + * of the byte data, but the process is partly statistical in nature, and + * the results cannot be guaranteed to always be correct. + *

+ * For best accuracy in charset detection, the input data should be primarily + * in a single language, and a minimum of a few hundred bytes worth of plain text + * in the language are needed. The detection process will attempt to + * ignore HTML or XML style markup that could otherwise obscure the content. + *

+ * Question: Should we have getters corresponding to the setters for input text + * and declared encoding? + *

+ * A thought: If we were to create our own type of Java Reader, we could defer + * figuring out an actual charset for data that starts out with too much English + * only ASCII until the user actually read through to something that didn't look + * like 7 bit English. If nothing else ever appeared, we would never need to + * actually choose the "real" charset. All assuming that the application just + * wants the data, and doesn't care about a char set name. + * + * + */ +public class CharsetDetector { + + + /** + * Constructor + */ + public CharsetDetector() { + } + + /** + * Set the declared encoding for charset detection. + * The declared encoding of an input text is an encoding obtained + * from an http header or xml declaration or similar source that + * can be provided as additional information to the charset detector. + * A match between a declared encoding and a possible detected encoding + * will raise the quality of that detected encoding by a small delta, + * and will also appear as a "reason" for the match. + *

+ * A declared encoding that is incompatible with the input data being + * analyzed will not be added to the list of possible encodings. + * + * @param encoding The declared encoding + */ + public CharsetDetector setDecaredEncoding(String encoding) { + return this; + } + + /** + * Set the input text (byte) data whose charset is to be detected. + * @param in the input text of unknown encoding + * @return This CharsetDetector + */ + public CharsetDetector setText(byte in[]) { + return this; + } + + /** + * Set the input text (byte) data whose charset is to be detected. + *

+ * The input stream that supplies the character data must have markSupported() + * == true; the charset detection process will read a small amount of data, + * then return the stream to its original position via + * the InputStream.reset() operation. The exact amount that will + * be read depends on the characteristics of the data itself. + + * @param in the input text of unknown encoding + * @return This CharsetDetector + */ + public CharsetDetector setText(InputStream in) { + return this; + } + + + /** + * Return the charset that best matches the supplied input data. + * + * Note though, that because the detection + * only looks at the start of the input data, + * there is a possibility that the returned charset will fail to handle + * the full set of input data. + *

+ * Raise an exception if + *

+ * + * @return a CharsetMatch object representing the best matching charset. + */ + public CharsetMatch detect() { + return null; + } + + /** + * Return an array of all charsets that appear to be plausible + * matches with the input data. The array is ordered with the + * best quality match first. + *

+ * Raise an exception if + *

+ * + * @return An array of CharsetMatch objects representing possibly matching charsets. + */ + public CharsetMatch[] detectAll() { + return null; + } + + + /** + * Autodetect the charset of an inputStream, and return a Java Reader + * to access the converted input data. + *

+ * This is a convenience method that is equivalent to + * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader(); + *

+ * For the input stream that supplies the character data, markSupported() + * must be true; the charset detection will read a small amount of data, + * then return the stream to its original position via + * the InputStream.reset() operation. The exact amount that will + * be read depends on the characteristics of the data itself. + *

+ * Raise an exception if no charsets appear to match the input data. + * + * @param in The source of the byte data in the unknown charset. + * + * @param declaredEncoding A declared encoding for the data, if available, + * or null or an empty string if none is available. + */ + public Reader getReader(InputStream in, String declaredEncoding) { + return null; + } + + /** + * Autodetect the charset of an array of bytes, and return a String + * containing the converted input data. + *

+ * This is a convenience method that is equivalent to + * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString(); + *

+ * Raise an exception if no charsets appear to match the input data. + * + * @param in The source of the byte data in the unknown charset. + * + * @param declaredEncoding A declared encoding for the data, if available, + * or null or an empty string if none is available. + */ + public String getString(byte[] in, String declaredEncoding) { + return null; + } + + + /** + * Get the names of all char sets that can be recognized by the char set detector. + * + * @return an array of the names of all charsets that can be recognized + * by the charset detector. + */ + public static String[] getAllDetectableCharsets() { + return null; + } + + +} diff --git a/icu4j/src/com/ibm/icu/text/CharsetMatch.java b/icu4j/src/com/ibm/icu/text/CharsetMatch.java new file mode 100644 index 00000000000..50aa60ffd96 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java @@ -0,0 +1,104 @@ +/** +******************************************************************************* +* Copyright (C) 2005, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.io.InputStream; +import java.io.Reader; + + +/** + * This class represents a charset that has been identified by a CharsetDetector + * as a possible encoding for a set of input data. From an instance of this + * class, you can ask for a confidence level in the charset identification, + * or for Java Reader or String to access the original byte data in Unicode form. + *

+ * Instances of this class are created only by CharsetDetectors. + */ +public class CharsetMatch { + + + /** + * Create a java.io.Reader for reading the Unicode character data corresponding + * to the original byte data supplied to the Charset detect operation. + * + * @return the Reader for the Unicode character data. + */ + public Reader getReader() { + return null; + } + + + + /** + * Create a Java String from Unicode character data corresponding + * to the original byte data supplied to the Charset detect operation. + * + * @return a String created from the converted input data. + */ + public String getString() { + return null; + + } + /** + * Create a Java String from Unicode character data corresponding + * to the original byte data supplied to the Charset detect operation. + * The length of the returned string is limited to the specified size; + * the string will be truncated to this length if necessary. A limit value of + * zero or less is ignored, and treated as no limit. + * + * @param maxLength The maximum length of the String to be created. + * @return a String created from the converted input data. + */ + public String getString(int maxLength) { + return null; + + } + + /** + * Get an indication of the confidence in the charset detected. + * Confidence values range from 0-100, with larger numbers indicating + * a better match of the input data to the characteristics of the + * charset. + * + * @return the confidence in the charset match + */ + public int getConfidence() { + return 0; + } + + /** + * Return an indication of what it was about the input data + * that caused this charset to be considered as a possible match. + *

+ * TODO: create a list of enum-like constants for the possible types of matches. + * + * @return the type of match found for this charset. + */ + public int getMatchType() { + return 0; + } + + + + /** + * Get the name of the detected charset. + * The name will be one that can be used with other APIs on the + * platform that accept charset names. It is the "Canonical name" + * as defined by the class java.nio.charset.Charset; for + * charsets that are registered with the IANA charset registry, + * this is the MIME-preferred registered name. + * + * @see java.nio.charset.Charset + * @see java.io.InputStreamReader + * + * @return The name of the charset. + */ + public String getName() { + return ""; + } + +}