From ae77a4f9edf516144a88d3050cf1549970e12c99 Mon Sep 17 00:00:00 2001 From: Ramy Said Date: Tue, 11 Jun 2013 12:39:56 +0000 Subject: [PATCH] ICU-9928 Charset Detector, IBM420_ar oddities X-SVN-Rev: 33817 --- icu4c/source/i18n/csrsbcs.cpp | 210 ++++++++++++++++++---------------- icu4c/source/i18n/csrsbcs.h | 43 ++++--- 2 files changed, 133 insertions(+), 120 deletions(-) diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp index 1aad70e39ae..80d21149280 100644 --- a/icu4c/source/i18n/csrsbcs.cpp +++ b/icu4c/source/i18n/csrsbcs.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -96,7 +96,7 @@ int32_t NGramParser::nextByte(InputText *det) return det->fInputBytes[byteIndex++]; } -int32_t NGramParser::parse(InputText *det) +void NGramParser::parseCharacters(InputText *det) { int32_t b; bool ignoreSpace = FALSE; @@ -113,6 +113,11 @@ int32_t NGramParser::parse(InputText *det) ignoreSpace = (mb == 0x20); } } +} + +int32_t NGramParser::parse(InputText *det) +{ + parseCharacters(det); // TODO: Is this OK? The buffer could have ended in the middle of a word... addByte(0x20); @@ -132,6 +137,102 @@ int32_t NGramParser::parse(InputText *det) return (int32_t) (rawPercent * 300.0); } +static const uint8_t unshapeMap_IBM420[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, +/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, +/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, +/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, +/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, +/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, +/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, +/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, +/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, +/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap) +{ + alef = 0x00; +} + + +int32_t NGramParser_IBM420::isLamAlef(int32_t b) +{ + if(b == 0xB2 || b == 0xB3){ + return 0x47; + }else if(b == 0xB4 || b == 0xB5){ + return 0x49; + }else if(b == 0xB8 || b == 0xB9){ + return 0x56; + }else + return 0x00; +} + +/* +* Arabic shaping needs to be done manually. Cannot call ArabicShaping class +* because CharsetDetector is dealing with bytes not Unicode code points. We could +* convert the bytes to Unicode code points but that would leave us dependent +* on CharsetICU which we try to avoid. IBM420 converter amongst different versions +* of JDK can produce different results and therefore is also avoided. +*/ +int32_t NGramParser_IBM420::nextByte(InputText *det) +{ + + if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) { + return -1; + } + int next; + + alef = isLamAlef(det->fInputBytes[byteIndex]); + if(alef != 0x00) + next = 0xB1 & 0xFF; + else + next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF; + + byteIndex++; + + return next; +} + +void NGramParser_IBM420::parseCharacters(InputText *det) +{ + int32_t b; + bool ignoreSpace = FALSE; + + while ((b = nextByte(det)) >= 0) { + uint8_t mb = charMap[b]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + ignoreSpace = (mb == 0x20); + } + + if(alef != 0x00){ + mb = charMap[alef & 0xFF]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + + ignoreSpace = (mb == 0x20); + } + + } + } +} + CharsetRecog_sbcs::CharsetRecog_sbcs() { // nothing else to do @@ -1098,26 +1199,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results return (confidence > 0); } -static const uint8_t unshapeMap_IBM420[] = { -/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ -/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, -/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, -/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, -/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, -/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, -/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, -/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, -/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, -/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, -/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, -/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, -/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, -}; - CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() { // nothing to do @@ -1128,88 +1209,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const return "ar"; } -void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) { - prev_fInputBytesLength = textIn->fInputLen; - prev_fInputBytes = textIn->fInputBytes; - int32_t length = 0; - uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length); +int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const +{ + NGramParser_IBM420 parser(ngrams, byteMap); + int32_t result; - if (bb != NULL) { - textIn->fInputBytes = bb; - textIn->fInputLen = length; + result = parser.parse(det); - deleteBuffer = TRUE; - } else { - deleteBuffer = FALSE; - } -} - -uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { - uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length); - - if (resultArray != NULL) { - for (int32_t i = 0; i < inputBytesLength; i++) { - resultArray[i] = unshapeMap_IBM420[resultArray[i]]; - } - } - - return resultArray; -} - -uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { - int32_t bigBufferLength = inputBytesLength * 2; - uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength); - uint8_t *resultBuffer = NULL; - - if (bigBuffer != NULL) { - int32_t bufferIndex; - static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 }; - - for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) { - if (isLamAlef(inputBytes[i])) { - bigBuffer[bufferIndex++] = unshapedLamAlef[0]; - bigBuffer[bufferIndex++] = unshapedLamAlef[1]; - } else { - bigBuffer[bufferIndex++] = inputBytes[i]; - } - } - - length = bufferIndex; - resultBuffer = (uint8_t *)uprv_malloc(length); - if (resultBuffer != NULL) { - uprv_memcpy(resultBuffer, bigBuffer, length); - } - } - - if (bigBuffer != NULL) { - uprv_free(bigBuffer); - } - - return resultBuffer; -} - -void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) { - if (deleteBuffer) { - uprv_free(textIn->fInputBytes); - - textIn->fInputBytes = prev_fInputBytes; - textIn->fInputLen = prev_fInputBytesLength; - } -} - -UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) { - static const uint8_t shapedLamAlef[] = { - 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 - }; - - for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) { - if (b == shapedLamAlef[i]) { - return TRUE; - } - } - - return FALSE; + return result; } CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl() diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h index cc26b057b48..2579c02905f 100644 --- a/icu4c/source/i18n/csrsbcs.h +++ b/icu4c/source/i18n/csrsbcs.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -19,15 +19,18 @@ U_NAMESPACE_BEGIN class NGramParser : public UMemory { private: - int32_t byteIndex; int32_t ngram; - - const int32_t *ngramList; - const uint8_t *charMap; + const int32_t *ngramList; int32_t ngramCount; int32_t hitCount; +protected: + int32_t byteIndex; + const uint8_t *charMap; + + void addByte(int32_t b); + public: NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); @@ -38,14 +41,27 @@ private: int32_t search(const int32_t *table, int32_t value); void lookup(int32_t thisNgram); - void addByte(int32_t b); - int32_t nextByte(InputText *det); + + virtual int32_t nextByte(InputText *det); + virtual void parseCharacters(InputText *det); public: int32_t parse(InputText *det); }; +class NGramParser_IBM420 : public NGramParser +{ +private: + int32_t alef; + int32_t isLamAlef(int32_t b); + int32_t nextByte(InputText *det); + void parseCharacters(InputText *det); + +public: + NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); +}; + class CharsetRecog_sbcs : public CharsetRecognizer { @@ -244,19 +260,8 @@ public: virtual ~CharsetRecog_IBM420_ar(); const char *getLanguage() const; + int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; -protected: - void matchInit(InputText *textIn); - void matchFinish(InputText *textIn); - -private: - uint8_t *prev_fInputBytes; - int32_t prev_fInputBytesLength; - UBool deleteBuffer; - - UBool isLamAlef(uint8_t b); - uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); - uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); }; class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {