ICU-9928 Charset Detector, IBM420_ar oddities

X-SVN-Rev: 33817
2025-04-07 22:44:49 +00:00 · 2013-06-11 12:39:56 +00:00 · 2013-06-11 12:39:56 +00:00 · ae77a4f9ed
commit ae77a4f9ed
parent 715d77036f
2 changed files with 133 additions and 120 deletions
--- a/icu4c/source/i18n/csrsbcs.cpp
+++ b/icu4c/source/i18n/csrsbcs.cpp
@ -1,6 +1,6 @@
 /*
 **********************************************************************
- *   Copyright (C) 2005-2012, International Business Machines
+ *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -96,7 +96,7 @@ int32_t NGramParser::nextByte(InputText *det)
    return det->fInputBytes[byteIndex++];
 }

-int32_t NGramParser::parse(InputText *det)
+void NGramParser::parseCharacters(InputText *det)
 {
    int32_t b;
    bool ignoreSpace = FALSE;
@ -113,6 +113,11 @@ int32_t NGramParser::parse(InputText *det)
            ignoreSpace = (mb == 0x20);
        }
    }
+}
+
+int32_t NGramParser::parse(InputText *det)
+{
+    parseCharacters(det);

    // TODO: Is this OK? The buffer could have ended in the middle of a word...
    addByte(0x20);
@ -132,6 +137,102 @@ int32_t NGramParser::parse(InputText *det)
    return (int32_t) (rawPercent * 300.0);
 }

+static const uint8_t unshapeMap_IBM420[] = {
+/*
+ * Byte-to-byte lookup table with 256 entries, indexed by a raw input byte
+ * (NGramParser_IBM420::nextByte() reads it as unshapeMap_IBM420[byte & 0xFF]).
+ * Every entry in rows 0- through 3- is 0x40. From row 4- on, most bytes map to
+ * themselves and the rest are folded onto another byte value (for example
+ * 0x43 -> 0x42 and 0x48 -> 0x47).
+ * NOTE(review): going by the table name, the folded entries are presumably the
+ * contextually shaped IBM420 letter forms and their targets the unshaped forms,
+ * and 0x40 is presumably the EBCDIC space -- confirm against the IBM420 code page.
+ *
+ * Row label = high nibble of the index, column label = low nibble:
+ *           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
+/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 
+/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 
+/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 
+/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 
+/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, 
+/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, 
+/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, 
+/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, 
+/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, 
+/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, 
+/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 
+/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 
+};
+
+NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
+{   // Both tables are handed unchanged to the NGramParser base constructor.
+	alef = 0x00;   // 0x00 is what isLamAlef() returns for "no lam-alef", so nothing is pending yet
+}
+
+
+int32_t NGramParser_IBM420::isLamAlef(int32_t b)
+{   /*
+     * Classifies one input byte. Despite the "is" name the result is a byte
+     * value, not a boolean:
+     *   0xB2 or 0xB3 -> 0x47
+     *   0xB4 or 0xB5 -> 0x49
+     *   0xB8 or 0xB9 -> 0x56
+     *   anything else -> 0x00
+     * nextByte() stores the result in 'alef' and treats non-zero as "this byte
+     * is a lam-alef"; parseCharacters() then feeds the returned value through
+     * charMap as an extra character.
+     * NOTE(review): the six inputs are presumably the shaped lam-alef ligature
+     * bytes of IBM420 and the results the matching alef variants -- confirm
+     * against the code page.
+     */
+	if(b == 0xB2 || b == 0xB3){
+         	return 0x47;        		
+        }else if(b == 0xB4 || b == 0xB5){
+         	return 0x49;
+        }else if(b == 0xB8 || b == 0xB9){
+         	return 0x56;
+        }else
+         	return 0x00;   // not one of the six recognised bytes
+}
+
+/*
+* Returns the next input byte in unshaped form, or -1 when scanning stops.
+*
+* Arabic shaping needs to be undone manually here. The ArabicShaping class cannot
+* be called because the charset detector deals with bytes, not Unicode code points.
+* Converting the bytes to Unicode code points first would make the detector
+* dependent on CharsetICU, which we try to avoid. The IBM420 converter can produce
+* different results amongst different versions of the JDK and is therefore avoided
+* as well.
+* NOTE(review): ArabicShaping, CharsetICU and the JDK are named in this C++ file;
+* presumably the rationale was written for a Java counterpart -- confirm.
+*
+* Behaviour:
+*  - returns -1 once byteIndex has reached det->fInputLen, and also when the
+*    current byte is 0, so a zero byte ends the scan just like the end of input;
+*  - a byte that isLamAlef() recognises is reported as 0xB1, and the non-zero value
+*    isLamAlef() returned stays in the member 'alef' for the caller to pick up;
+*  - any other byte is translated through unshapeMap_IBM420, and 'alef' is 0x00.
+* byteIndex is advanced by one whenever a byte is returned.
+*/ 
+int32_t NGramParser_IBM420::nextByte(InputText *det)
+{
+	
+    if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
+        return -1;
+    }              
+    int next;
+             
+    alef = isLamAlef(det->fInputBytes[byteIndex]);   // overwritten on every returned byte
+    if(alef != 0x00)
+        next = 0xB1 & 0xFF;   // first half of the split ligature; presumably the lam -- confirm
+    else
+        next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
+            
+    byteIndex++;
+             
+    return next;
+}
+
+void NGramParser_IBM420::parseCharacters(InputText *det)
+{   /*
+     * Drains the input through nextByte(), maps each byte through charMap and
+     * passes the result to addByte(). Two filters apply to every mapped byte:
+     * a mapped value of 0 is dropped, and a 0x20 that directly follows another
+     * emitted 0x20 is dropped, so runs of 0x20 collapse to one.
+     * When nextByte() left a non-zero value in 'alef', that value is mapped and
+     * filtered the same way right after the byte just returned, so one such
+     * input byte can contribute two characters.
+     */
+	int32_t b;
+    bool ignoreSpace = FALSE;   // TRUE while the last mapped non-zero byte was 0x20
+
+    while ((b = nextByte(det)) >= 0) {   // nextByte() returns -1 to stop
+        uint8_t mb = charMap[b];
+
+        // TODO: 0x20 might not be a space in all character sets...
+        if (mb != 0) {
+            if (!(mb == 0x20 && ignoreSpace)) {
+                addByte(mb);
+            }
+            ignoreSpace = (mb == 0x20);
+        }
+		
+		if(alef != 0x00){   // set by the nextByte() call above
+            mb = charMap[alef & 0xFF];
+                     
+            // TODO: 0x20 might not be a space in all character sets...
+            if (mb != 0) {
+                if (!(mb == 0x20 && ignoreSpace)) {
+                    addByte(mb);                    
+                }
+                         
+                ignoreSpace = (mb == 0x20);
+            }
+                	 
+        }
+    }
+}
+
 CharsetRecog_sbcs::CharsetRecog_sbcs()
 {
    // nothing else to do
@ -1098,26 +1199,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results
    return (confidence > 0);
 }

-static const uint8_t unshapeMap_IBM420[] = {
-/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
-/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 
-/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 
-/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 
-/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 
-/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, 
-/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, 
-/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, 
-/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, 
-/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, 
-/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, 
-/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 
-/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 
-};
-
 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
 {
    // nothing to do
@ -1128,88 +1209,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
    return "ar";
 }

-void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
-    prev_fInputBytesLength = textIn->fInputLen;
-    prev_fInputBytes = textIn->fInputBytes;
    
-    int32_t length = 0;
-    uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
+int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
+{
+    NGramParser_IBM420 parser(ngrams, byteMap);
+    int32_t result;
    
-    if (bb != NULL) {
-        textIn->fInputBytes = bb;
-        textIn->fInputLen = length;
+    result = parser.parse(det);
        
-        deleteBuffer = TRUE;
-    } else {
-        deleteBuffer = FALSE;
-    }
-}
-
-uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
-    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
-    
-    if (resultArray != NULL) {
-        for (int32_t i = 0; i < inputBytesLength; i++) {
-            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
-        }
-    }
-    
-    return resultArray;
-}
-
-uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
-    int32_t bigBufferLength = inputBytesLength * 2;
-    uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
-    uint8_t *resultBuffer = NULL;
-    
-    if (bigBuffer != NULL) {
-        int32_t bufferIndex;
-        static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
-        
-        for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
-            if (isLamAlef(inputBytes[i])) {
-                bigBuffer[bufferIndex++] = unshapedLamAlef[0];
-                bigBuffer[bufferIndex++] = unshapedLamAlef[1];
-            } else {
-                bigBuffer[bufferIndex++] = inputBytes[i];
-            }
-        }
-        
-        length = bufferIndex;
-        resultBuffer = (uint8_t *)uprv_malloc(length);
-        if (resultBuffer != NULL) {
-            uprv_memcpy(resultBuffer, bigBuffer, length);
-        }
-    }
-    
-    if (bigBuffer != NULL) {
-        uprv_free(bigBuffer);
-    }
-    
-    return resultBuffer;
-}
-
-void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
-    if (deleteBuffer) {
-        uprv_free(textIn->fInputBytes);
-        
-        textIn->fInputBytes = prev_fInputBytes;
-        textIn->fInputLen = prev_fInputBytesLength;
-    }
-}
-
-UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
-    static const uint8_t shapedLamAlef[] = {
-        0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 
-    };
-    
-    for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
-        if (b == shapedLamAlef[i]) {
-            return TRUE;
-        }
-    }
-    
-    return FALSE;
+    return result;
 }

 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
--- a/icu4c/source/i18n/csrsbcs.h
+++ b/icu4c/source/i18n/csrsbcs.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
- *   Copyright (C) 2005-2012, International Business Machines
+ *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -19,15 +19,18 @@ U_NAMESPACE_BEGIN
 class NGramParser : public UMemory
 {
 private:
-    int32_t byteIndex;
    int32_t ngram;
-
-    const int32_t *ngramList;
-    const uint8_t *charMap;
+    const int32_t *ngramList;    

    int32_t ngramCount;
    int32_t hitCount;

+protected:
+	int32_t byteIndex;
+    const uint8_t *charMap;
+
+	void addByte(int32_t b);
+
 public:
    NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);

@ -38,14 +41,27 @@ private:
    int32_t search(const int32_t *table, int32_t value);

    void lookup(int32_t thisNgram);
-    void addByte(int32_t b);
-    int32_t nextByte(InputText *det);
+    
+    virtual int32_t nextByte(InputText *det);
+	virtual void parseCharacters(InputText *det);

 public:
    int32_t parse(InputText *det);

 };

+class NGramParser_IBM420 : public NGramParser   // n-gram parser variant that unshapes IBM420 bytes while reading them
+{
+private:
+	int32_t alef;   // result of isLamAlef() for the byte nextByte() last returned; 0x00 when it was not a lam-alef
+	int32_t isLamAlef(int32_t b);   // returns a non-zero byte value for the recognised lam-alef bytes, 0x00 otherwise
+	int32_t nextByte(InputText *det);   // redefines the virtual NGramParser::nextByte
+	void parseCharacters(InputText *det);   // redefines the virtual NGramParser::parseCharacters
+
+public:
+    NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);   // forwards both arguments to NGramParser
+};
+

 class CharsetRecog_sbcs : public CharsetRecognizer
 {
@ -244,19 +260,8 @@ public:
    virtual ~CharsetRecog_IBM420_ar();

    const char *getLanguage() const;
+	int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
    
-protected:
-    void matchInit(InputText *textIn);
-    void matchFinish(InputText *textIn);
-    
-private:
-    uint8_t *prev_fInputBytes;
-    int32_t prev_fInputBytesLength;
-    UBool deleteBuffer;
-    
-    UBool isLamAlef(uint8_t b);
-    uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
-    uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
 };

 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {