icu/icu4c/source/i18n/csrutf8.cpp
Fredrik Roubert 030fa1a479 ICU-21148 Consistently use standard lowercase true/false everywhere.
This is the normal standard way in C, C++ as well as Java and there's no
longer any reason for ICU to be different. The various internal macros
providing custom boolean constants can all be deleted and code as well
as documentation can be updated to use lowercase true/false everywhere.
2022-09-07 20:56:33 +02:00

111 lines
3 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "csrutf8.h"
#include "csmatch.h"
U_NAMESPACE_BEGIN
CharsetRecog_UTF8::~CharsetRecog_UTF8()
{
// nothing to do
}
const char *CharsetRecog_UTF8::getName() const
{
return "UTF-8";
}
UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
bool hasBOM = false;
int32_t numValid = 0;
int32_t numInvalid = 0;
const uint8_t *inputBytes = input->fRawInput;
int32_t i;
int32_t trailBytes = 0;
int32_t confidence;
if (input->fRawLength >= 3 &&
inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
hasBOM = true;
}
// Scan for multi-byte sequences
for (i=0; i < input->fRawLength; i += 1) {
int32_t b = inputBytes[i];
if ((b & 0x80) == 0) {
continue; // ASCII
}
// Hi bit on char found. Figure out how long the sequence should be
if ((b & 0x0E0) == 0x0C0) {
trailBytes = 1;
} else if ((b & 0x0F0) == 0x0E0) {
trailBytes = 2;
} else if ((b & 0x0F8) == 0xF0) {
trailBytes = 3;
} else {
numInvalid += 1;
continue;
}
// Verify that we've got the right number of trail bytes in the sequence
for (;;) {
i += 1;
if (i >= input->fRawLength) {
break;
}
b = inputBytes[i];
if ((b & 0xC0) != 0x080) {
numInvalid += 1;
break;
}
if (--trailBytes == 0) {
numValid += 1;
break;
}
}
}
// Cook up some sort of confidence score, based on presence of a BOM
// and the existence of valid and/or invalid multi-byte sequences.
confidence = 0;
if (hasBOM && numInvalid == 0) {
confidence = 100;
} else if (hasBOM && numValid > numInvalid*10) {
confidence = 80;
} else if (numValid > 3 && numInvalid == 0) {
confidence = 100;
} else if (numValid > 0 && numInvalid == 0) {
confidence = 80;
} else if (numValid == 0 && numInvalid == 0) {
// Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
// accepts ASCII with confidence = 10.
confidence = 15;
} else if (numValid > numInvalid*10) {
// Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
confidence = 25;
}
results->set(input, this, confidence);
return (confidence > 0);
}
U_NAMESPACE_END
#endif