mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-9928 Charset Detector, IBM420_ar oddities
X-SVN-Rev: 33817
This commit is contained in:
parent
715d77036f
commit
ae77a4f9ed
2 changed files with 133 additions and 120 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2012, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -96,7 +96,7 @@ int32_t NGramParser::nextByte(InputText *det)
|
|||
return det->fInputBytes[byteIndex++];
|
||||
}
|
||||
|
||||
int32_t NGramParser::parse(InputText *det)
|
||||
void NGramParser::parseCharacters(InputText *det)
|
||||
{
|
||||
int32_t b;
|
||||
bool ignoreSpace = FALSE;
|
||||
|
@ -113,6 +113,11 @@ int32_t NGramParser::parse(InputText *det)
|
|||
ignoreSpace = (mb == 0x20);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t NGramParser::parse(InputText *det)
|
||||
{
|
||||
parseCharacters(det);
|
||||
|
||||
// TODO: Is this OK? The buffer could have ended in the middle of a word...
|
||||
addByte(0x20);
|
||||
|
@ -132,6 +137,102 @@ int32_t NGramParser::parse(InputText *det)
|
|||
return (int32_t) (rawPercent * 300.0);
|
||||
}
|
||||
|
||||
/*
 * 256-entry byte-to-byte lookup table, indexed by an input byte.
 * NGramParser_IBM420::nextByte() returns the value stored here for every
 * byte that is not one of the lam-alef codes it handles itself.
 *
 * Rows 0- through 3- all map to 0x40 (presumably the EBCDIC space -- confirm
 * against the IBM420 code page). Rows E- and F- are identity. Elsewhere most
 * entries are identity and the remainder map onto another code
 * (e.g. 0x43 -> 0x42, 0x51 -> 0x49); judging by the table's name these fold
 * shaped Arabic forms onto a single form -- confirm against the code page.
 */
static const uint8_t unshapeMap_IBM420[] = {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
|
||||
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
|
||||
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
|
||||
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
|
||||
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
|
||||
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
|
||||
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
|
||||
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
|
||||
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
};
|
||||
|
||||
/*
 * Builds an IBM420 n-gram parser. Both arguments are handed unchanged to the
 * NGramParser base constructor; the only extra state is `alef`, which starts
 * at 0x00 (the value nextByte() uses for "no alef pending").
 */
NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap)
    : NGramParser(theNgramList, theCharMap),
      alef(0x00)
{
}
|
||||
|
||||
|
||||
/*
 * Tests whether b is one of the six byte values this parser treats as a
 * lam-alef ligature.
 *
 * Returns the companion byte for the ligature (0x47, 0x49 or 0x56), which
 * parseCharacters() emits right after the 0xB1 that nextByte() reports for
 * the ligature itself. Returns 0x00 for every other value of b.
 */
int32_t NGramParser_IBM420::isLamAlef(int32_t b)
{
    switch (b) {
    case 0xB2:
    case 0xB3:
        return 0x47;

    case 0xB4:
    case 0xB5:
        return 0x49;

    case 0xB8:
    case 0xB9:
        return 0x56;

    default:
        return 0x00;
    }
}
|
||||
|
||||
/*
|
||||
* Arabic shaping needs to be done manually. Cannot call ArabicShaping class
|
||||
* because CharsetDetector is dealing with bytes not Unicode code points. We could
|
||||
* convert the bytes to Unicode code points but that would leave us dependent
|
||||
* on CharsetICU which we try to avoid. IBM420 converter amongst different versions
|
||||
* of JDK can produce different results and therefore is also avoided.
|
||||
*/
|
||||
int32_t NGramParser_IBM420::nextByte(InputText *det)
|
||||
{
|
||||
|
||||
if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
|
||||
return -1;
|
||||
}
|
||||
int next;
|
||||
|
||||
alef = isLamAlef(det->fInputBytes[byteIndex]);
|
||||
if(alef != 0x00)
|
||||
next = 0xB1 & 0xFF;
|
||||
else
|
||||
next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
|
||||
|
||||
byteIndex++;
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
/*
 * Feeds the whole input through charMap into addByte().
 *
 * Every byte obtained from nextByte() produces one mapped byte, or two when
 * nextByte() reported a lam-alef ligature and left the companion byte in
 * `alef`. Bytes that charMap maps to 0 are dropped, and a run of consecutive
 * mapped 0x20 values is reduced to its first one.
 */
void NGramParser_IBM420::parseCharacters(InputText *det)
{
    bool skipSpace = FALSE;
    int32_t current;

    while ((current = nextByte(det)) >= 0) {
        // The byte just read, followed by the pending alef (if any).
        const int32_t pieces[2] = { current, alef & 0xFF };
        const int32_t pieceCount = (alef != 0x00) ? 2 : 1;

        for (int32_t i = 0; i < pieceCount; ++i) {
            const uint8_t mapped = charMap[pieces[i]];

            // TODO: 0x20 might not be a space in all character sets...
            if (mapped == 0) {
                continue;
            }

            if (mapped != 0x20 || !skipSpace) {
                addByte(mapped);
            }

            skipSpace = (mapped == 0x20);
        }
    }
}
|
||||
|
||||
CharsetRecog_sbcs::CharsetRecog_sbcs()
|
||||
{
|
||||
// nothing else to do
|
||||
|
@ -1098,26 +1199,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results
|
|||
return (confidence > 0);
|
||||
}
|
||||
|
||||
static const uint8_t unshapeMap_IBM420[] = {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
|
||||
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
|
||||
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
|
||||
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
|
||||
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
|
||||
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
|
||||
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
|
||||
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
|
||||
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
|
||||
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
};
|
||||
|
||||
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
|
||||
{
|
||||
// nothing to do
|
||||
|
@ -1128,88 +1209,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
|
|||
return "ar";
|
||||
}
|
||||
|
||||
void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
|
||||
prev_fInputBytesLength = textIn->fInputLen;
|
||||
prev_fInputBytes = textIn->fInputBytes;
|
||||
|
||||
int32_t length = 0;
|
||||
uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
|
||||
int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
|
||||
{
|
||||
NGramParser_IBM420 parser(ngrams, byteMap);
|
||||
int32_t result;
|
||||
|
||||
if (bb != NULL) {
|
||||
textIn->fInputBytes = bb;
|
||||
textIn->fInputLen = length;
|
||||
result = parser.parse(det);
|
||||
|
||||
deleteBuffer = TRUE;
|
||||
} else {
|
||||
deleteBuffer = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
|
||||
uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
|
||||
|
||||
if (resultArray != NULL) {
|
||||
for (int32_t i = 0; i < inputBytesLength; i++) {
|
||||
resultArray[i] = unshapeMap_IBM420[resultArray[i]];
|
||||
}
|
||||
}
|
||||
|
||||
return resultArray;
|
||||
}
|
||||
|
||||
uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
|
||||
int32_t bigBufferLength = inputBytesLength * 2;
|
||||
uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
|
||||
uint8_t *resultBuffer = NULL;
|
||||
|
||||
if (bigBuffer != NULL) {
|
||||
int32_t bufferIndex;
|
||||
static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
|
||||
|
||||
for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
|
||||
if (isLamAlef(inputBytes[i])) {
|
||||
bigBuffer[bufferIndex++] = unshapedLamAlef[0];
|
||||
bigBuffer[bufferIndex++] = unshapedLamAlef[1];
|
||||
} else {
|
||||
bigBuffer[bufferIndex++] = inputBytes[i];
|
||||
}
|
||||
}
|
||||
|
||||
length = bufferIndex;
|
||||
resultBuffer = (uint8_t *)uprv_malloc(length);
|
||||
if (resultBuffer != NULL) {
|
||||
uprv_memcpy(resultBuffer, bigBuffer, length);
|
||||
}
|
||||
}
|
||||
|
||||
if (bigBuffer != NULL) {
|
||||
uprv_free(bigBuffer);
|
||||
}
|
||||
|
||||
return resultBuffer;
|
||||
}
|
||||
|
||||
void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
|
||||
if (deleteBuffer) {
|
||||
uprv_free(textIn->fInputBytes);
|
||||
|
||||
textIn->fInputBytes = prev_fInputBytes;
|
||||
textIn->fInputLen = prev_fInputBytesLength;
|
||||
}
|
||||
}
|
||||
|
||||
UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
|
||||
static const uint8_t shapedLamAlef[] = {
|
||||
0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
|
||||
};
|
||||
|
||||
for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
|
||||
if (b == shapedLamAlef[i]) {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
return result;
|
||||
}
|
||||
|
||||
CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2012, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -19,15 +19,18 @@ U_NAMESPACE_BEGIN
|
|||
class NGramParser : public UMemory
|
||||
{
|
||||
private:
|
||||
int32_t byteIndex;
|
||||
int32_t ngram;
|
||||
|
||||
const int32_t *ngramList;
|
||||
const uint8_t *charMap;
|
||||
const int32_t *ngramList;
|
||||
|
||||
int32_t ngramCount;
|
||||
int32_t hitCount;
|
||||
|
||||
protected:
|
||||
int32_t byteIndex;
|
||||
const uint8_t *charMap;
|
||||
|
||||
void addByte(int32_t b);
|
||||
|
||||
public:
|
||||
NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
|
||||
|
||||
|
@ -38,14 +41,27 @@ private:
|
|||
int32_t search(const int32_t *table, int32_t value);
|
||||
|
||||
void lookup(int32_t thisNgram);
|
||||
void addByte(int32_t b);
|
||||
int32_t nextByte(InputText *det);
|
||||
|
||||
virtual int32_t nextByte(InputText *det);
|
||||
virtual void parseCharacters(InputText *det);
|
||||
|
||||
public:
|
||||
int32_t parse(InputText *det);
|
||||
|
||||
};
|
||||
|
||||
// NGramParser specialisation used for IBM420 input. It redeclares nextByte()
// and parseCharacters(), which NGramParser declares virtual, and adds the
// lam-alef state those two functions share.
class NGramParser_IBM420 : public NGramParser
|
||||
{
|
||||
private:
|
||||
// Set by nextByte() for each byte it consumes: the companion byte of a
// lam-alef ligature, or 0x00 when the byte was not such a ligature.
int32_t alef;
|
||||
// Returns the companion byte (0x47, 0x49 or 0x56) if b is a lam-alef
// ligature code, otherwise 0x00.
int32_t isLamAlef(int32_t b);
|
||||
// Returns the next input byte in unshaped form, or -1 at end of input.
int32_t nextByte(InputText *det);
|
||||
// Maps every byte from nextByte() (plus any pending alef) through charMap
// and passes the result to addByte().
void parseCharacters(InputText *det);
|
||||
|
||||
public:
|
||||
// Forwards both arguments to the NGramParser constructor.
NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
|
||||
};
|
||||
|
||||
|
||||
class CharsetRecog_sbcs : public CharsetRecognizer
|
||||
{
|
||||
|
@ -244,19 +260,8 @@ public:
|
|||
virtual ~CharsetRecog_IBM420_ar();
|
||||
|
||||
const char *getLanguage() const;
|
||||
int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
|
||||
|
||||
protected:
|
||||
void matchInit(InputText *textIn);
|
||||
void matchFinish(InputText *textIn);
|
||||
|
||||
private:
|
||||
uint8_t *prev_fInputBytes;
|
||||
int32_t prev_fInputBytesLength;
|
||||
UBool deleteBuffer;
|
||||
|
||||
UBool isLamAlef(uint8_t b);
|
||||
uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
|
||||
uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
|
||||
|
|
Loading…
Add table
Reference in a new issue