ICU-9928 Charset Detector, IBM420_ar oddities

X-SVN-Rev: 33817
This commit is contained in:
Ramy Said 2013-06-11 12:39:56 +00:00
parent 715d77036f
commit ae77a4f9ed
2 changed files with 133 additions and 120 deletions

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -96,7 +96,7 @@ int32_t NGramParser::nextByte(InputText *det)
return det->fInputBytes[byteIndex++];
}
int32_t NGramParser::parse(InputText *det)
void NGramParser::parseCharacters(InputText *det)
{
int32_t b;
bool ignoreSpace = FALSE;
@ -113,6 +113,11 @@ int32_t NGramParser::parse(InputText *det)
ignoreSpace = (mb == 0x20);
}
}
}
int32_t NGramParser::parse(InputText *det)
{
parseCharacters(det);
// TODO: Is this OK? The buffer could have ended in the middle of a word...
addByte(0x20);
@ -132,6 +137,102 @@ int32_t NGramParser::parse(InputText *det)
return (int32_t) (rawPercent * 300.0);
}
/*
 * 256-entry byte-to-byte lookup table, indexed by an input byte value
 * (NGramParser_IBM420::nextByte reads unshapeMap_IBM420[byte & 0xFF]).
 * Rows 0- through 3- map every byte to 0x40; in the remaining rows most bytes
 * map to themselves and a subset is folded onto a different byte value.
 * NOTE(review): judging by the name, the folded entries presumably collapse
 * shaped (positional) IBM420 Arabic forms onto one representative byte --
 * confirm against the IBM420 code page before editing any entry.
 */
static const uint8_t unshapeMap_IBM420[] = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
/**
 * Constructs an IBM420 n-gram parser.
 *
 * Both arguments are handed to the NGramParser base constructor unchanged.
 * The alef member starts at 0x00, the value that nextByte()/parseCharacters()
 * compare against to decide whether an extra alef byte has to be emitted.
 *
 * @param theNgramList n-gram list forwarded to NGramParser
 * @param theCharMap   character map forwarded to NGramParser
 */
NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap)
    : NGramParser(theNgramList, theCharMap),
      alef(0x00)
{
}
/**
 * Maps a lam-alef byte to the byte that parseCharacters() emits after the
 * 0xB1 substituted by nextByte().
 *
 * @param b byte value to classify
 * @return 0x47 for 0xB2/0xB3, 0x49 for 0xB4/0xB5, 0x56 for 0xB8/0xB9,
 *         and 0x00 for any other value (meaning "not a lam-alef")
 */
int32_t NGramParser_IBM420::isLamAlef(int32_t b)
{
    switch (b) {
    case 0xB2:
    case 0xB3:
        return 0x47;
    case 0xB4:
    case 0xB5:
        return 0x49;
    case 0xB8:
    case 0xB9:
        return 0x56;
    default:
        return 0x00;
    }
}
/*
 * Arabic shaping needs to be done manually. We cannot call the ArabicShaping class
 * because CharsetDetector deals with bytes, not Unicode code points. We could
 * convert the bytes to Unicode code points, but that would leave us dependent
 * on CharsetICU, which we try to avoid. The IBM420 converter can produce different
 * results across different versions of the JDK and is therefore also avoided.
 */
/**
 * Fetches the next input byte, normalizes it for IBM420 and advances byteIndex.
 *
 * Side effect: whenever a byte is consumed, alef is overwritten with
 * isLamAlef(byte), so the caller can tell whether a second byte must be
 * emitted for it. On the end-of-input path alef and byteIndex are left untouched.
 *
 * @param det input text providing fInputBytes / fInputLen
 * @return -1 when byteIndex has reached det->fInputLen or the current byte is 0;
 *         0xB1 when the current byte is a lam-alef (isLamAlef() != 0x00);
 *         otherwise the unshapeMap_IBM420 entry for the current byte
 */
int32_t NGramParser_IBM420::nextByte(InputText *det)
{
    if (byteIndex >= det->fInputLen) {
        return -1;
    }

    const int32_t current = det->fInputBytes[byteIndex];
    if (current == 0) {
        // A zero byte ends parsing just like running out of input.
        return -1;
    }

    alef = isLamAlef(current);
    ++byteIndex;

    if (alef != 0x00) {
        // Lam-alef: report 0xB1 here; the alef part is kept in `alef`.
        return 0xB1 & 0xFF;
    }
    return unshapeMap_IBM420[current & 0xFF] & 0xFF;
}
/**
 * Feeds the whole input through charMap into the n-gram accumulator.
 *
 * For every value returned by nextByte() the mapped character is passed to
 * addByte(); when nextByte() flagged a lam-alef (alef != 0x00) the alef value
 * is mapped and passed on right after it. Characters that map to 0 are
 * dropped, and a 0x20 that directly follows another emitted 0x20 is skipped.
 *
 * @param det input text to parse
 */
void NGramParser_IBM420::parseCharacters(InputText *det)
{
    bool skipSpace = FALSE;

    for (int32_t unit = nextByte(det); unit >= 0; unit = nextByte(det)) {
        // nextByte() has just refreshed `alef`; a non-zero value means one
        // extra byte has to follow the byte it returned.
        const int32_t pending[2] = { unit, alef & 0xFF };
        const int32_t pendingCount = (alef != 0x00) ? 2 : 1;

        for (int32_t i = 0; i < pendingCount; ++i) {
            const uint8_t mapped = charMap[pending[i]];

            // TODO: 0x20 might not be a space in all character sets...
            if (mapped == 0) {
                continue;
            }
            if (mapped != 0x20 || !skipSpace) {
                addByte(mapped);
            }
            skipSpace = (mapped == 0x20);
        }
    }
}
CharsetRecog_sbcs::CharsetRecog_sbcs()
{
// nothing else to do
@ -1098,26 +1199,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results
return (confidence > 0);
}
/*
 * 256-entry byte-to-byte lookup table, indexed by a byte value
 * (CharsetRecog_IBM420_ar::unshape replaces each buffer byte b with
 * unshapeMap_IBM420[b]). Rows 0- through 3- map every byte to 0x40; in the
 * remaining rows most bytes map to themselves and a subset is folded onto a
 * different byte value.
 * NOTE(review): judging by the name, the folded entries presumably collapse
 * shaped (positional) IBM420 Arabic forms onto one representative byte --
 * confirm against the IBM420 code page before editing any entry.
 */
static const uint8_t unshapeMap_IBM420[] = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
{
// nothing to do
@ -1128,88 +1209,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
return "ar";
}
void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
prev_fInputBytesLength = textIn->fInputLen;
prev_fInputBytes = textIn->fInputBytes;
int32_t length = 0;
uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
{
NGramParser_IBM420 parser(ngrams, byteMap);
int32_t result;
if (bb != NULL) {
textIn->fInputBytes = bb;
textIn->fInputLen = length;
result = parser.parse(det);
deleteBuffer = TRUE;
} else {
deleteBuffer = FALSE;
}
}
/**
 * Produces an unshaped copy of the input bytes.
 *
 * The input is first expanded by unshapeLamAlef(), then every byte of the
 * expanded buffer is replaced by its unshapeMap_IBM420 entry.
 *
 * @param inputBytes       bytes to unshape
 * @param inputBytesLength number of bytes in inputBytes
 * @param length           receives the number of bytes in the returned buffer
 *                         (set by unshapeLamAlef())
 * @return the buffer obtained from unshapeLamAlef() (allocated there with
 *         uprv_malloc), or NULL if it returned NULL
 */
uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);

    if (resultArray != NULL) {
        // Walk the expanded buffer (length bytes), not the original input
        // size: each lam-alef grows the buffer by one byte, so stopping at
        // inputBytesLength would leave the tail of the buffer unmapped.
        for (int32_t i = 0; i < length; i++) {
            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
        }
    }
    return resultArray;
}
/**
 * Copies the input while expanding every byte for which isLamAlef() is true
 * into the two-byte sequence 0xb1, 0x56.
 *
 * The expansion is built in a scratch buffer of twice the input size and then
 * copied into a second buffer of exactly the produced size.
 *
 * @param inputBytes       bytes to expand
 * @param inputBytesLength number of bytes in inputBytes
 * @param length           receives the number of bytes produced; left untouched
 *                         when the scratch allocation fails
 * @return a uprv_malloc'ed buffer holding the expanded bytes, or NULL if either
 *         allocation fails
 */
uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };

    // Worst case: every input byte is a lam-alef and doubles in size.
    const int32_t scratchCapacity = inputBytesLength * 2;
    uint8_t *scratch = (uint8_t *)uprv_malloc(scratchCapacity);
    if (scratch == NULL) {
        return NULL;
    }

    int32_t written = 0;
    for (int32_t readPos = 0; readPos < inputBytesLength; ++readPos) {
        const uint8_t current = inputBytes[readPos];
        if (isLamAlef(current)) {
            scratch[written++] = unshapedLamAlef[0];
            scratch[written++] = unshapedLamAlef[1];
        } else {
            scratch[written++] = current;
        }
    }
    length = written;

    // Hand back a right-sized copy; the scratch buffer is always released.
    uint8_t *exactCopy = (uint8_t *)uprv_malloc(length);
    if (exactCopy != NULL) {
        uprv_memcpy(exactCopy, scratch, length);
    }
    uprv_free(scratch);

    return exactCopy;
}
/**
 * Restores the input buffer that was saved in prev_fInputBytes /
 * prev_fInputBytesLength.
 *
 * Does nothing unless deleteBuffer is set; otherwise the buffer currently held
 * by textIn is released with uprv_free() and the saved pointer and length are
 * put back.
 *
 * @param textIn input text whose fInputBytes / fInputLen are restored
 */
void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
    if (!deleteBuffer) {
        return;
    }

    uprv_free(textIn->fInputBytes);
    textIn->fInputBytes = prev_fInputBytes;
    textIn->fInputLen = prev_fInputBytesLength;
}
UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
static const uint8_t shapedLamAlef[] = {
0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
};
for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
if (b == shapedLamAlef[i]) {
return TRUE;
}
}
return FALSE;
return result;
}
CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -19,15 +19,18 @@ U_NAMESPACE_BEGIN
class NGramParser : public UMemory
{
private:
int32_t byteIndex;
int32_t ngram;
const int32_t *ngramList;
const uint8_t *charMap;
const int32_t *ngramList;
int32_t ngramCount;
int32_t hitCount;
protected:
int32_t byteIndex;
const uint8_t *charMap;
void addByte(int32_t b);
public:
NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
@ -38,14 +41,27 @@ private:
int32_t search(const int32_t *table, int32_t value);
void lookup(int32_t thisNgram);
void addByte(int32_t b);
int32_t nextByte(InputText *det);
virtual int32_t nextByte(InputText *det);
virtual void parseCharacters(InputText *det);
public:
int32_t parse(InputText *det);
};
/**
 * NGramParser variant for IBM420 input. It replaces the base class's virtual
 * nextByte() and parseCharacters() and keeps track of the alef byte that has
 * to follow a lam-alef.
 */
class NGramParser_IBM420 : public NGramParser
{
public:
    NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);

private:
    // Alef byte for the most recently read lam-alef; 0x00 when there is none.
    int32_t alef;

    // Returns the alef byte for a lam-alef byte, or 0x00 for any other byte.
    int32_t isLamAlef(int32_t b);

    // Overrides of the virtual hooks declared in NGramParser.
    virtual int32_t nextByte(InputText *det);
    virtual void parseCharacters(InputText *det);
};
class CharsetRecog_sbcs : public CharsetRecognizer
{
@ -244,19 +260,8 @@ public:
virtual ~CharsetRecog_IBM420_ar();
const char *getLanguage() const;
int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
protected:
void matchInit(InputText *textIn);
void matchFinish(InputText *textIn);
private:
uint8_t *prev_fInputBytes;
int32_t prev_fInputBytesLength;
UBool deleteBuffer;
UBool isLamAlef(uint8_t b);
uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
};
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {