diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index 954e4fed296..3ad7136f878 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2011, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -89,20 +89,8 @@ void CharsetDetector::setRecognizers(UErrorCode &status) new CharsetRecog_UTF_32_BE(), new CharsetRecog_UTF_32_LE(), - new CharsetRecog_8859_1_en(), - new CharsetRecog_8859_1_da(), - new CharsetRecog_8859_1_de(), - new CharsetRecog_8859_1_es(), - new CharsetRecog_8859_1_fr(), - new CharsetRecog_8859_1_it(), - new CharsetRecog_8859_1_nl(), - new CharsetRecog_8859_1_no(), - new CharsetRecog_8859_1_pt(), - new CharsetRecog_8859_1_sv(), - new CharsetRecog_8859_2_cs(), - new CharsetRecog_8859_2_hu(), - new CharsetRecog_8859_2_pl(), - new CharsetRecog_8859_2_ro(), + new CharsetRecog_8859_1(), + new CharsetRecog_8859_2(), new CharsetRecog_8859_5_ru(), new CharsetRecog_8859_6_ar(), new CharsetRecog_8859_7_el(), @@ -263,10 +251,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set return NULL; - } else if(fFreshTextSet) { + } else if (fFreshTextSet) { CharsetRecognizer *csr; - int32_t detectResults; - int32_t confidence; int32_t i; textIn->MungeInput(fStripTags); @@ -276,46 +262,14 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, resultCount = 0; for (i = 0; i < fCSRecognizers_size; i += 1) { csr = fCSRecognizers[i]; - detectResults = csr->match(textIn); - confidence = detectResults; - - if (confidence > 0) { - resultArray[resultCount++]->set(textIn, csr, confidence); + if (csr->match(textIn, resultArray[resultCount])) { + resultCount++; } } - for(i = resultCount; i < fCSRecognizers_size; i += 1) { - resultArray[i]->set(textIn, 0, 0); + if (resultCount > 1) { + uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); } - - uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); - - // Remove duplicate charsets from the results. - // Simple minded, brute force approach - check each entry against all that follow. - // The first entry of any duplicated set is the one that should be kept because it will - // be the one with the highest confidence rating. - // (Duplicate matches have different languages, only the charset is the same) - // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually - // deleted, just reordered, with the unwanted duplicates placed after the good results. - int32_t j, k; - for (i=0; igetName(); - for (j=i+1; jgetName()) != 0) { - // Not a duplicate. - j++; - } else { - // Duplicate entry at index j. - CharsetMatch *duplicate = resultArray[j]; - for (k=j; kgetName(); + } + if (fLang == NULL) { + fLang = csr->getLanguage(); + } + } } const char* CharsetMatch::getName()const { - return csr->getName(); + return fCharsetName; } const char* CharsetMatch::getLanguage()const { - return csr->getLanguage(); + return fLang; } int32_t CharsetMatch::getConfidence()const diff --git a/icu4c/source/i18n/csmatch.h b/icu4c/source/i18n/csmatch.h index 50b78a3a1dc..a05d2e3d173 100644 --- a/icu4c/source/i18n/csmatch.h +++ b/icu4c/source/i18n/csmatch.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -17,19 +17,45 @@ U_NAMESPACE_BEGIN class InputText; class CharsetRecognizer; +/* + * CharsetMatch represents the results produced by one Charset Recognizer for one input text + * Any confidence > 0 indicates a possible match, meaning that the input bytes + * are at least legal. + * + * The full results of a detect are represented by an array of these + * CharsetMatch objects, each representing a possible matching charset. + * + * Note that a single charset recognizer may detect multiple closely related + * charsets, and set different names depending on the exact input bytes seen. + */ class CharsetMatch : public UMemory { private: - CharsetRecognizer *csr; - InputText *textIn; - int32_t confidence; + const CharsetRecognizer *csr; + InputText *textIn; + int32_t confidence; + const char *fCharsetName; + const char *fLang; public: CharsetMatch(); - void set(InputText *input, CharsetRecognizer *cr, int32_t conf); + /** + * fully set the state of this CharsetMatch. + * Called by the CharsetRecognizers to record match results. + * Default (NULL) parameters for names will be filled by calling the + * corresponding getters on the recognizer. + */ + void set(InputText *input, + const CharsetRecognizer *cr, + int32_t conf, + const char *csName=NULL, + const char *lang=NULL); - const char *getName()const; + /** + * Return the name of the charset for this Match + */ + const char *getName() const; const char *getLanguage()const; diff --git a/icu4c/source/i18n/csr2022.cpp b/icu4c/source/i18n/csr2022.cpp index a890d11bda5..3db0bc9f36a 100644 --- a/icu4c/source/i18n/csr2022.cpp +++ b/icu4c/source/i18n/csr2022.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2011, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -12,6 +12,7 @@ #include "cstring.h" #include "csr2022.h" +#include "csmatch.h" U_NAMESPACE_BEGIN @@ -19,7 +20,7 @@ U_NAMESPACE_BEGIN /** * Matching function shared among the 2022 detectors JP, CN and KR - * Counts up the number of legal an unrecognized escape sequences in + * Counts up the number of legal and unrecognized escape sequences in * the sample of text, and computes a score based on the total number & * the proportion that fit the encoding. * @@ -29,7 +30,7 @@ U_NAMESPACE_BEGIN * @param escapeSequences the byte escape sequences to test for. * @return match quality, in the range of 0-100. */ -int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) +int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const { int32_t i, j; int32_t escN; @@ -138,42 +139,50 @@ static const uint8_t escapeSequences_2022CN[][5] = { CharsetRecog_2022JP::~CharsetRecog_2022JP() {} -const char *CharsetRecog_2022JP::getName() const -{ +const char *CharsetRecog_2022JP::getName() const { return "ISO-2022-JP"; } -int32_t CharsetRecog_2022JP::match(InputText *textIn) -{ - return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022JP, ARRAY_SIZE(escapeSequences_2022JP)); +UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022JP, + ARRAY_SIZE(escapeSequences_2022JP)); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_2022KR::~CharsetRecog_2022KR() {} -const char *CharsetRecog_2022KR::getName() const -{ +const char *CharsetRecog_2022KR::getName() const { return "ISO-2022-KR"; } -int32_t CharsetRecog_2022KR::match(InputText *textIn) -{ - return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022KR, ARRAY_SIZE(escapeSequences_2022KR)); +UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022KR, + ARRAY_SIZE(escapeSequences_2022KR)); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_2022CN::~CharsetRecog_2022CN() {} -const char *CharsetRecog_2022CN::getName() const -{ +const char *CharsetRecog_2022CN::getName() const { return "ISO-2022-CN"; } -int32_t CharsetRecog_2022CN::match(InputText *textIn) -{ - return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022CN, ARRAY_SIZE(escapeSequences_2022CN)); +UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const { + int32_t confidence = match_2022(textIn->fInputBytes, + textIn->fInputLen, + escapeSequences_2022CN, + ARRAY_SIZE(escapeSequences_2022CN)); + results->set(textIn, this, confidence); + return (confidence > 0); } -CharsetRecog_2022::~CharsetRecog_2022() -{ +CharsetRecog_2022::~CharsetRecog_2022() { // nothing to do } diff --git a/icu4c/source/i18n/csr2022.h b/icu4c/source/i18n/csr2022.h index 2b5b40fb1e3..2ac2b87db8d 100644 --- a/icu4c/source/i18n/csr2022.h +++ b/icu4c/source/i18n/csr2022.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2011, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -16,6 +16,8 @@ U_NAMESPACE_BEGIN +class CharsetMatch; + /** * class CharsetRecog_2022 part of the ICU charset detection imlementation. * This is a superclass for the individual detectors for @@ -46,7 +48,10 @@ protected: * @param escapeSequences the byte escape sequences to test for. * @return match quality, in the range of 0-100. */ - int32_t match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length); + int32_t match_2022(const uint8_t *text, + int32_t textLen, + const uint8_t escapeSequences[][5], + int32_t escapeSequences_length) const; }; @@ -57,7 +62,7 @@ public: const char *getName() const; - int32_t match(InputText *textIn); + UBool match(InputText *textIn, CharsetMatch *results) const; }; class CharsetRecog_2022KR :public CharsetRecog_2022 { @@ -66,7 +71,7 @@ public: const char *getName() const; - int32_t match(InputText *textIn); + UBool match(InputText *textIn, CharsetMatch *results) const; }; @@ -77,7 +82,7 @@ public: const char* getName() const; - int32_t match(InputText *textIn); + UBool match(InputText *textIn, CharsetMatch *results) const; }; U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrecog.h b/icu4c/source/i18n/csrecog.h index 75cf94f49f4..6b7573a1a48 100644 --- a/icu4c/source/i18n/csrecog.h +++ b/icu4c/source/i18n/csrecog.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -16,11 +16,17 @@ U_NAMESPACE_BEGIN +class CharsetMatch; + class CharsetRecognizer : public UMemory { public: /** * Get the IANA name of this charset. + * Note that some recognizers can recognize more than one charset, but that this API + * assumes just one name per recognizer. + * TODO: need to account for multiple names in public API that enumerates over the + * known detectable charsets. * @return the charset name. */ virtual const char *getName() const = 0; @@ -31,7 +37,14 @@ class CharsetRecognizer : public UMemory */ virtual const char *getLanguage() const; - virtual int32_t match(InputText *textIn) = 0; + /* + * Try the given input text against this Charset, and fill in the results object + * with the quality of the match plus other information related to the match. + * + * Return TRUE if the the input bytes are a potential match, and + * FALSE if the input data is not compatible with, or illegal in this charset. + */ + virtual UBool match(InputText *textIn, CharsetMatch *results) const = 0; virtual ~CharsetRecognizer(); }; diff --git a/icu4c/source/i18n/csrmbcs.cpp b/icu4c/source/i18n/csrmbcs.cpp index 6252a2b8254..fef2e869015 100644 --- a/icu4c/source/i18n/csrmbcs.cpp +++ b/icu4c/source/i18n/csrmbcs.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2011, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -9,6 +9,7 @@ #if !UCONFIG_NO_CONVERSION +#include "csmatch.h" #include "csrmbcs.h" #include @@ -143,7 +144,7 @@ CharsetRecog_mbcs::~CharsetRecog_mbcs() // nothing to do. } -int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) { +int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { int32_t singleByteCharCount = 0; int32_t doubleByteCharCount = 0; int32_t commonCharCount = 0; @@ -239,7 +240,7 @@ CharsetRecog_sjis::~CharsetRecog_sjis() // nothing to do } -UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) { +UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { it->index = it->nextIndex; it->error = FALSE; @@ -267,9 +268,10 @@ UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) { return TRUE; } -int32_t CharsetRecog_sjis::match(InputText* det) -{ - return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); +UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { + int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); + results->set(det, this, confidence); + return (confidence > 0); } const char *CharsetRecog_sjis::getName() const @@ -287,7 +289,7 @@ CharsetRecog_euc::~CharsetRecog_euc() // nothing to do } -UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) { +UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; @@ -366,9 +368,11 @@ const char *CharsetRecog_euc_jp::getLanguage() const return "ja"; } -int32_t CharsetRecog_euc_jp::match(InputText *det) +UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const { - return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); + int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); + results->set(det, this, confidence); + return (confidence > 0); } CharsetRecog_euc_kr::~CharsetRecog_euc_kr() @@ -386,9 +390,11 @@ const char *CharsetRecog_euc_kr::getLanguage() const return "ko"; } -int32_t CharsetRecog_euc_kr::match(InputText *det) +UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const { - return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); + int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); + results->set(det, this, confidence); + return (confidence > 0); } CharsetRecog_big5::~CharsetRecog_big5() @@ -396,7 +402,7 @@ CharsetRecog_big5::~CharsetRecog_big5() // nothing to do } -UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) +UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte; @@ -436,9 +442,11 @@ const char *CharsetRecog_big5::getLanguage() const return "zh"; } -int32_t CharsetRecog_big5::match(InputText *det) +UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const { - return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); + int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); + results->set(det, this, confidence); + return (confidence > 0); } CharsetRecog_gb_18030::~CharsetRecog_gb_18030() @@ -446,7 +454,7 @@ CharsetRecog_gb_18030::~CharsetRecog_gb_18030() // nothing to do } -UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) { +UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; @@ -510,9 +518,11 @@ const char *CharsetRecog_gb_18030::getLanguage() const return "zh"; } -int32_t CharsetRecog_gb_18030::match(InputText *det) +UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const { - return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); + int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); + results->set(det, this, confidence); + return (confidence > 0); } U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrmbcs.h b/icu4c/source/i18n/csrmbcs.h index 371889a1e61..9ea9d8f8ee0 100644 --- a/icu4c/source/i18n/csrmbcs.h +++ b/icu4c/source/i18n/csrmbcs.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -58,7 +58,7 @@ protected: *
* bits 8-15: The match reason, an enum-like value. */ - int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen); + int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; public: @@ -71,7 +71,7 @@ public: const char *getName() const = 0; const char *getLanguage() const = 0; - int32_t match(InputText* det) = 0; + UBool match(InputText* input, CharsetMatch *results) const = 0; /** * Get the next character (however many bytes it is) from the input data @@ -85,7 +85,7 @@ public: * being iterated over. * @return True if a character was returned, false at end of input. */ - virtual UBool nextChar(IteratedChar *it, InputText *textIn) = 0; + virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; }; @@ -98,9 +98,9 @@ class CharsetRecog_sjis : public CharsetRecog_mbcs { public: virtual ~CharsetRecog_sjis(); - UBool nextChar(IteratedChar *it, InputText *det); + UBool nextChar(IteratedChar *it, InputText *det) const; - int32_t match(InputText *det); + UBool match(InputText* input, CharsetMatch *results) const; const char *getName() const; const char *getLanguage() const; @@ -122,14 +122,14 @@ public: const char *getName() const = 0; const char *getLanguage() const = 0; - int32_t match(InputText* det) = 0; + UBool match(InputText* input, CharsetMatch *results) const = 0; /* * (non-Javadoc) * Get the next character value for EUC based encodings. * Character "value" is simply the raw bytes that make up the character * packed into an int. */ - UBool nextChar(IteratedChar *it, InputText *det); + UBool nextChar(IteratedChar *it, InputText *det) const; }; /** @@ -144,7 +144,7 @@ public: const char *getName() const; const char *getLanguage() const; - int32_t match(InputText *det); + UBool match(InputText* input, CharsetMatch *results) const; }; /** @@ -159,7 +159,7 @@ public: const char *getName() const; const char *getLanguage() const; - int32_t match(InputText *det); + UBool match(InputText* input, CharsetMatch *results) const; }; /** @@ -172,12 +172,12 @@ class CharsetRecog_big5 : public CharsetRecog_mbcs public: virtual ~CharsetRecog_big5(); - UBool nextChar(IteratedChar* it, InputText* det); + UBool nextChar(IteratedChar* it, InputText* det) const; const char *getName() const; const char *getLanguage() const; - int32_t match(InputText *det); + UBool match(InputText* input, CharsetMatch *results) const; }; @@ -191,12 +191,12 @@ class CharsetRecog_gb_18030 : public CharsetRecog_mbcs public: virtual ~CharsetRecog_gb_18030(); - UBool nextChar(IteratedChar* it, InputText* det); + UBool nextChar(IteratedChar* it, InputText* det) const; const char *getName() const; const char *getLanguage() const; - int32_t match(InputText *det); + UBool match(InputText* input, CharsetMatch *results) const; }; U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp index 5b55af64751..4c387c7d634 100644 --- a/icu4c/source/i18n/csrsbcs.cpp +++ b/icu4c/source/i18n/csrsbcs.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2010, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -11,9 +11,11 @@ #if !UCONFIG_NO_CONVERSION #include "csrsbcs.h" +#include "csmatch.h" #define N_GRAM_SIZE 3 #define N_GRAM_MASK 0xFFFFFF +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) U_NAMESPACE_BEGIN @@ -131,7 +133,6 @@ int32_t NGramParser::parse(InputText *det) } CharsetRecog_sbcs::CharsetRecog_sbcs() -: haveC1Bytes(FALSE) { // nothing else to do } @@ -141,12 +142,11 @@ CharsetRecog_sbcs::~CharsetRecog_sbcs() // nothing to do } -int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) +int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const { NGramParser parser(ngrams, byteMap); int32_t result; - haveC1Bytes = det->fC1Bytes; result = parser.parse(det); return result; @@ -591,103 +591,144 @@ static const uint8_t charMap_IBM420_ar[]= { /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, }; -//ISO-8859-1,2,5,6,7,8,9 Ngrams -static const int32_t ngrams_8859_1_en[] = { +//ISO-8859-1,2,5,6,7,8,9 Ngrams + +struct NGramsPlusLang { + const int32_t ngrams[64]; + const char * lang; +}; + +static const NGramsPlusLang ngrams_8859_1[] = { + { + { 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74, 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420, 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320, -}; - -static const int32_t ngrams_8859_1_da[] = { + }, + "en" + }, + { + { 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320, 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520, 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572, -}; - -static const int32_t ngrams_8859_1_de[] = { + }, + "da" + }, + { + { 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F, 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220, 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572, -}; - -static const int32_t ngrams_8859_1_es[] = { + }, + "de" + }, + { + { 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C, 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064, 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20, -}; - -static const int32_t ngrams_8859_1_fr[] = { + }, + "es" + }, + { + { 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E, 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20, 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420, 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220, -}; - -static const int32_t ngrams_8859_1_it[] = { + }, + "fr" + }, + { + { 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073, 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220, 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20, 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F, -}; - -static const int32_t ngrams_8859_1_nl[] = { + }, + "it" + }, + { + { 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665, 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E, 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F, 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, -}; - -static const int32_t ngrams_8859_1_no[] = { + }, + "nl" + }, + { + { 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, -}; - -static const int32_t ngrams_8859_1_pt[] = { + }, + "no" + }, + { + { 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, -}; - -static const int32_t ngrams_8859_1_sv[] = { + }, + "pt" + }, + { + { 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, + }, + "sv" + } }; -static const int32_t ngrams_8859_2_cs[] = { + +static const NGramsPlusLang ngrams_8859_2[] = { + { + { 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, -}; - -static const int32_t ngrams_8859_2_hu[] = { + }, + "cs" + }, + { + { 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, -}; - -static const int32_t ngrams_8859_2_pl[] = { + }, + "hu" + }, + { + { 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, -}; - -static const int32_t ngrams_8859_2_ro[] = { + }, + "pl" + }, + { + { 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, + }, + "ro" + } }; static const int32_t ngrams_8859_5_ru[] = { @@ -737,224 +778,54 @@ CharsetRecog_8859_1::~CharsetRecog_8859_1() // nothing to do } +UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const { + const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1"; + uint32_t i; + int32_t bestConfidenceSoFar = -1; + for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) { + const int32_t *ngrams = ngrams_8859_1[i].ngrams; + const char *lang = ngrams_8859_1[i].lang; + int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1); + if (confidence > bestConfidenceSoFar) { + results->set(textIn, this, confidence, name, lang); + bestConfidenceSoFar = confidence; + } + } + return (bestConfidenceSoFar > 0); +} + const char *CharsetRecog_8859_1::getName() const { - return haveC1Bytes? "windows-1252" : "ISO-8859-1"; + return "ISO-8859-1"; } -const char *CharsetRecog_8859_1_en::getLanguage() const -{ - return "en"; -} - -CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en() -{ - // nothing to do -} - -int32_t CharsetRecog_8859_1_en::match(InputText *textIn) -{ - int32_t result = match_sbcs(textIn, ngrams_8859_1_en, charMap_8859_1); - - // printf("8859_1_en: result = %d\n", result); - return result; //match_sbcs(textIn, ngrams, charMap); -} - -CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_da::getLanguage() const -{ - return "da"; -} - -int32_t CharsetRecog_8859_1_da::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_da, charMap_8859_1); -} - -CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {} - -const char *CharsetRecog_8859_1_de::getLanguage() const -{ - return "de"; -} - -int32_t CharsetRecog_8859_1_de::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_de, charMap_8859_1); -} - -CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_es::getLanguage() const -{ - return "es"; -} - -int32_t CharsetRecog_8859_1_es::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_es, charMap_8859_1); -} - -CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_fr::getLanguage() const -{ - return "fr"; -} - -int32_t CharsetRecog_8859_1_fr::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_fr, charMap_8859_1); -} - -CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_it::getLanguage() const -{ - return "it"; -} - -int32_t CharsetRecog_8859_1_it::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_it, charMap_8859_1); -} - -CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_nl::getLanguage() const -{ - return "nl"; -} - -int32_t CharsetRecog_8859_1_nl::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_nl, charMap_8859_1); -} - -CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {} - -const char *CharsetRecog_8859_1_no::getLanguage() const -{ - return "no"; -} - -int32_t CharsetRecog_8859_1_no::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_no, charMap_8859_1); -} - -CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt() -{ - // nothing to do -} - -const char *CharsetRecog_8859_1_pt::getLanguage() const -{ - return "pt"; -} - -int32_t CharsetRecog_8859_1_pt::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_pt, charMap_8859_1); -} - -CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {} - -const char *CharsetRecog_8859_1_sv::getLanguage() const -{ - return "sv"; -} - -int32_t CharsetRecog_8859_1_sv::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_1_sv, charMap_8859_1); -} CharsetRecog_8859_2::~CharsetRecog_8859_2() { // nothing to do } +UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const { + const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2"; + uint32_t i; + int32_t bestConfidenceSoFar = -1; + for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) { + const int32_t *ngrams = ngrams_8859_2[i].ngrams; + const char *lang = ngrams_8859_2[i].lang; + int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2); + if (confidence > bestConfidenceSoFar) { + results->set(textIn, this, confidence, name, lang); + bestConfidenceSoFar = confidence; + } + } + return (bestConfidenceSoFar > 0); +} + const char *CharsetRecog_8859_2::getName() const { - return haveC1Bytes? "windows-1250" : "ISO-8859-2"; + return "ISO-8859-2"; } -CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs() -{ - // nothing to do -} - -const char *CharsetRecog_8859_2_cs::getLanguage() const -{ - return "cs"; -} - -int32_t CharsetRecog_8859_2_cs::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_2_cs, charMap_8859_2); -} - -CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu() -{ - // nothing to do -} - -const char *CharsetRecog_8859_2_hu::getLanguage() const -{ - return "hu"; -} - -int32_t CharsetRecog_8859_2_hu::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_2_hu, charMap_8859_2); -} - -CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl() -{ - // nothing to do -} - -const char *CharsetRecog_8859_2_pl::getLanguage() const -{ - return "pl"; -} - -int32_t CharsetRecog_8859_2_pl::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_2_pl, charMap_8859_2); -} - -CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro() -{ - // nothing to do -} - -const char *CharsetRecog_8859_2_ro::getLanguage() const -{ - return "ro"; -} - -int32_t CharsetRecog_8859_2_ro::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_8859_2_ro, charMap_8859_2); -} CharsetRecog_8859_5::~CharsetRecog_8859_5() { @@ -976,9 +847,11 @@ const char *CharsetRecog_8859_5_ru::getLanguage() const return "ru"; } -int32_t CharsetRecog_8859_5_ru::match(InputText *textIn) +UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5); + int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_8859_6::~CharsetRecog_8859_6() @@ -1001,9 +874,11 @@ const char *CharsetRecog_8859_6_ar::getLanguage() const return "ar"; } -int32_t CharsetRecog_8859_6_ar::match(InputText *textIn) +UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6); + int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_8859_7::~CharsetRecog_8859_7() @@ -1013,7 +888,7 @@ CharsetRecog_8859_7::~CharsetRecog_8859_7() const char *CharsetRecog_8859_7::getName() const { - return haveC1Bytes? "windows-1253" : "ISO-8859-7"; + return "ISO-8859-7"; } CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el() @@ -1026,9 +901,12 @@ const char *CharsetRecog_8859_7_el::getLanguage() const return "el"; } -int32_t CharsetRecog_8859_7_el::match(InputText *textIn) +UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7); + const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7); + results->set(textIn, this, confidence, name, "el"); + return (confidence > 0); } CharsetRecog_8859_8::~CharsetRecog_8859_8() @@ -1038,7 +916,7 @@ CharsetRecog_8859_8::~CharsetRecog_8859_8() const char *CharsetRecog_8859_8::getName() const { - return haveC1Bytes? "windows-1255" : "ISO-8859-8"; + return "ISO-8859-8"; } CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he () @@ -1048,7 +926,7 @@ CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he () const char *CharsetRecog_8859_8_I_he::getName() const { - return haveC1Bytes? "windows-1255" : "ISO-8859-8-I"; + return "ISO-8859-8-I"; } const char *CharsetRecog_8859_8_I_he::getLanguage() const @@ -1056,9 +934,12 @@ const char *CharsetRecog_8859_8_I_he::getLanguage() const return "he"; } -int32_t CharsetRecog_8859_8_I_he::match(InputText *textIn) +UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8); + const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8); + results->set(textIn, this, confidence, name, "he"); + return (confidence > 0); } CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he() @@ -1071,9 +952,12 @@ const char *CharsetRecog_8859_8_he::getLanguage() const return "he"; } -int32_t CharsetRecog_8859_8_he::match(InputText *textIn) +UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8); + const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8); + results->set(textIn, this, confidence, name, "he"); + return (confidence > 0); } CharsetRecog_8859_9::~CharsetRecog_8859_9() @@ -1083,7 +967,7 @@ CharsetRecog_8859_9::~CharsetRecog_8859_9() const char *CharsetRecog_8859_9::getName() const { - return haveC1Bytes? "windows-1254" : "ISO-8859-9"; + return "ISO-8859-9"; } CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr () @@ -1096,9 +980,12 @@ const char *CharsetRecog_8859_9_tr::getLanguage() const return "tr"; } -int32_t CharsetRecog_8859_9_tr::match(InputText *textIn) +UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9); + const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9"; + int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9); + results->set(textIn, this, confidence, name, "tr"); + return (confidence > 0); } CharsetRecog_windows_1256::~CharsetRecog_windows_1256() @@ -1116,9 +1003,11 @@ const char *CharsetRecog_windows_1256::getLanguage() const return "ar"; } -int32_t CharsetRecog_windows_1256::match(InputText *textIn) +UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256); + int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_windows_1251::~CharsetRecog_windows_1251() @@ -1136,9 +1025,11 @@ const char *CharsetRecog_windows_1251::getLanguage() const return "ru"; } -int32_t CharsetRecog_windows_1251::match(InputText *textIn) +UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251); + int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R() @@ -1156,9 +1047,11 @@ const char *CharsetRecog_KOI8_R::getLanguage() const return "ru"; } -int32_t CharsetRecog_KOI8_R::match(InputText *textIn) +UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); + int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() @@ -1181,9 +1074,11 @@ const char *CharsetRecog_IBM424_he_rtl::getName() const return "IBM424_rtl"; } -int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn) +UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he); + int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() @@ -1196,9 +1091,11 @@ const char *CharsetRecog_IBM424_he_ltr::getName() const return "IBM424_ltr"; } -int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn) +UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he); + int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he); + results->set(textIn, this, confidence); + return (confidence > 0); } static const uint8_t unshapeMap_IBM420[] = { @@ -1325,9 +1222,11 @@ const char *CharsetRecog_IBM420_ar_rtl::getName() const return "IBM420_rtl"; } -int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn) +UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar); + int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar); + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() @@ -1340,9 +1239,11 @@ const char *CharsetRecog_IBM420_ar_ltr::getName() const return "IBM420_ltr"; } -int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn) +UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const { - return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar); + int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar); + results->set(textIn, this, confidence); + return (confidence > 0); } U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h index 21cbabe5663..cc26b057b48 100644 --- a/icu4c/source/i18n/csrsbcs.h +++ b/icu4c/source/i18n/csrsbcs.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2009, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -46,44 +46,37 @@ public: }; + class CharsetRecog_sbcs : public CharsetRecognizer { -protected: - UBool haveC1Bytes; - public: CharsetRecog_sbcs(); - virtual ~CharsetRecog_sbcs(); - virtual const char *getName() const = 0; - - virtual int32_t match(InputText *det) = 0; - - int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]); + virtual UBool match(InputText *det, CharsetMatch *results) const = 0; + virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; }; class CharsetRecog_8859_1 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_1(); - const char *getName() const; + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_2 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_2(); - const char *getName() const; + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_5 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_5(); - const char *getName() const; }; @@ -119,145 +112,7 @@ public: const char *getName() const; }; -class CharsetRecog_8859_1_en : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_en(); - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_da : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_da(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_de : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_de(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_es : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_es(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_fr(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_it : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_it(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_nl(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_no : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_no(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_pt(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1 -{ -public: - virtual ~CharsetRecog_8859_1_sv(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2 -{ -public: - virtual ~CharsetRecog_8859_2_cs(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2 -{ -public: - virtual ~CharsetRecog_8859_2_hu(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2 -{ -public: - virtual ~CharsetRecog_8859_2_pl(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; - -class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2 -{ -public: - virtual ~CharsetRecog_8859_2_ro(); - - const char *getLanguage() const; - - int32_t match(InputText *textIn); -}; class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 { @@ -266,7 +121,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 @@ -276,7 +131,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 @@ -286,7 +141,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 @@ -298,7 +153,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 @@ -308,7 +163,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 @@ -318,7 +173,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_windows_1256 : public CharsetRecog_sbcs @@ -330,7 +185,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_windows_1251 : public CharsetRecog_sbcs @@ -342,7 +197,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; @@ -355,7 +210,7 @@ public: const char *getLanguage() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_IBM424_he : public CharsetRecog_sbcs @@ -372,7 +227,7 @@ public: const char *getName() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { @@ -380,7 +235,7 @@ class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { const char *getName() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs @@ -410,7 +265,7 @@ public: const char *getName() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { @@ -418,10 +273,10 @@ class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { const char *getName() const; - int32_t match(InputText *textIn); + virtual UBool match(InputText *det, CharsetMatch *results) const; }; U_NAMESPACE_END -#endif +#endif /* !UCONFIG_NO_CONVERSION */ #endif /* __CSRSBCS_H */ diff --git a/icu4c/source/i18n/csrucode.cpp b/icu4c/source/i18n/csrucode.cpp index 99a76d850e6..d286cd3a504 100644 --- a/icu4c/source/i18n/csrucode.cpp +++ b/icu4c/source/i18n/csrucode.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -10,6 +10,7 @@ #if !UCONFIG_NO_CONVERSION #include "csrucode.h" +#include "csmatch.h" U_NAMESPACE_BEGIN @@ -28,16 +29,18 @@ const char *CharsetRecog_UTF_16_BE::getName() const return "UTF-16BE"; } -int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn) +UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; + int32_t confidence = 0; if (input[0] == 0xFE && input[1] == 0xFF) { - return 100; + confidence = 100; } // TODO: Do some statastics to check for unsigned UTF-16BE - return 0; + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() @@ -50,16 +53,18 @@ const char *CharsetRecog_UTF_16_LE::getName() const return "UTF-16LE"; } -int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn) +UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; + int32_t confidence = 0; if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { - return 100; + confidence = 100; } // TODO: Do some statastics to check for unsigned UTF-16LE - return 0; + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_UTF_32::~CharsetRecog_UTF_32() @@ -67,7 +72,7 @@ CharsetRecog_UTF_32::~CharsetRecog_UTF_32() // nothing to do } -int32_t CharsetRecog_UTF_32::match(InputText* textIn) +UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; int32_t limit = (textIn->fRawLength / 4) * 4; @@ -106,7 +111,8 @@ int32_t CharsetRecog_UTF_32::match(InputText* textIn) confidence = 25; } - return confidence; + results->set(textIn, this, confidence); + return (confidence > 0); } CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() diff --git a/icu4c/source/i18n/csrucode.h b/icu4c/source/i18n/csrucode.h index 315bad6b25e..a8a4f2bc521 100644 --- a/icu4c/source/i18n/csrucode.h +++ b/icu4c/source/i18n/csrucode.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -36,7 +36,7 @@ public: /* (non-Javadoc) * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) */ - int32_t match(InputText* textIn) = 0; + UBool match(InputText* textIn, CharsetMatch *results) const = 0; }; @@ -48,7 +48,7 @@ public: const char *getName() const; - int32_t match(InputText* textIn); + UBool match(InputText* textIn, CharsetMatch *results) const; }; class CharsetRecog_UTF_16_LE : public CharsetRecog_Unicode @@ -59,7 +59,7 @@ public: const char *getName() const; - int32_t match(InputText* textIn); + UBool match(InputText* textIn, CharsetMatch *results) const; }; class CharsetRecog_UTF_32 : public CharsetRecog_Unicode @@ -72,7 +72,7 @@ public: const char* getName() const = 0; - int32_t match(InputText* textIn); + UBool match(InputText* textIn, CharsetMatch *results) const; }; diff --git a/icu4c/source/i18n/csrutf8.cpp b/icu4c/source/i18n/csrutf8.cpp index b87c277fbf9..420c66909d4 100644 --- a/icu4c/source/i18n/csrutf8.cpp +++ b/icu4c/source/i18n/csrutf8.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -10,6 +10,7 @@ #if !UCONFIG_NO_CONVERSION #include "csrutf8.h" +#include "csmatch.h" U_NAMESPACE_BEGIN @@ -23,23 +24,23 @@ const char *CharsetRecog_UTF8::getName() const return "UTF-8"; } -int32_t CharsetRecog_UTF8::match(InputText* det) { +UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { bool hasBOM = FALSE; int32_t numValid = 0; int32_t numInvalid = 0; - const uint8_t *input = det->fRawInput; + const uint8_t *inputBytes = input->fRawInput; int32_t i; int32_t trailBytes = 0; int32_t confidence; - if (det->fRawLength >= 3 && - input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) { + if (input->fRawLength >= 3 && + inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) { hasBOM = TRUE; } // Scan for multi-byte sequences - for (i=0; i < det->fRawLength; i += 1) { - int32_t b = input[i]; + for (i=0; i < input->fRawLength; i += 1) { + int32_t b = inputBytes[i]; if ((b & 0x80) == 0) { continue; // ASCII @@ -66,11 +67,11 @@ int32_t CharsetRecog_UTF8::match(InputText* det) { for (;;) { i += 1; - if (i >= det->fRawLength) { + if (i >= input->fRawLength) { break; } - b = input[i]; + b = inputBytes[i]; if ((b & 0xC0) != 0x080) { numInvalid += 1; @@ -104,7 +105,8 @@ int32_t CharsetRecog_UTF8::match(InputText* det) { confidence = 25; } - return confidence; + results->set(input, this, confidence); + return (confidence > 0); } U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrutf8.h b/icu4c/source/i18n/csrutf8.h index 10fe287a435..82e8f9d7faf 100644 --- a/icu4c/source/i18n/csrutf8.h +++ b/icu4c/source/i18n/csrutf8.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -32,7 +32,7 @@ class CharsetRecog_UTF8: public CharsetRecognizer { /* (non-Javadoc) * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) */ - int32_t match(InputText *det); + UBool match(InputText *input, CharsetMatch *results) const; }; diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index 0316383de2c..a9d9fad3ee5 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -799,10 +799,11 @@ void CharsetDetectionTest::Ticket6954Test() { name1 = ucsdet_getName(match1, &status); TEST_ASSERT_SUCCESS(status); - // Test fails now - // TEST_ASSERT(strcmp(name1, "windows-1252")==0); + TEST_ASSERT(strcmp(name1, "windows-1252")==0); ucsdet_close(csd1); ucsdet_close(csd2); + freeBytes(bISO); + freeBytes(bWindows); #endif }