JS: Fixed UTF-8 string encoder/decoder for high codepoints.
This commit is contained in:
parent
8d8115bf52
commit
23f108d471
2 changed files with 36 additions and 15 deletions
|
@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
|
|||
|
||||
/**
|
||||
* Reads and parses a UTF-8 encoded unicode string from the stream.
|
||||
* The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with
|
||||
* the exception that the implementation here does not get confused if it
|
||||
* encounters characters longer than three bytes. These characters are ignored
|
||||
* though, as they are extremely rare: three UTF-8 bytes cover virtually all
|
||||
* characters in common use (http://en.wikipedia.org/wiki/UTF-8).
|
||||
* The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
|
||||
* Supports codepoints from U+0000 up to U+10FFFF.
|
||||
* See http://en.wikipedia.org/wiki/UTF-8.
|
||||
* @param {number} length The length of the string to read.
|
||||
* @return {string} The decoded string.
|
||||
*/
|
||||
|
@ -907,30 +905,47 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
|
|||
var bytes = this.bytes_;
|
||||
var cursor = this.cursor_;
|
||||
var end = cursor + length;
|
||||
var chars = [];
|
||||
var codepoints = [];
|
||||
|
||||
while (cursor < end) {
|
||||
var c = bytes[cursor++];
|
||||
if (c < 128) { // Regular 7-bit ASCII.
|
||||
chars.push(c);
|
||||
codepoints.push(c);
|
||||
} else if (c < 192) {
|
||||
// UTF-8 continuation mark. We are out of sync. This
|
||||
// might happen if we attempted to read a character
|
||||
// with more than three bytes.
|
||||
// with more than four bytes.
|
||||
continue;
|
||||
} else if (c < 224) { // UTF-8 with two bytes.
|
||||
var c2 = bytes[cursor++];
|
||||
chars.push(((c & 31) << 6) | (c2 & 63));
|
||||
codepoints.push(((c & 31) << 6) | (c2 & 63));
|
||||
} else if (c < 240) { // UTF-8 with three bytes.
|
||||
var c2 = bytes[cursor++];
|
||||
var c3 = bytes[cursor++];
|
||||
chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
|
||||
codepoints.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
|
||||
} else if (c < 248) { // UTF-8 with 4 bytes.
|
||||
var c2 = bytes[cursor++];
|
||||
var c3 = bytes[cursor++];
|
||||
var c4 = bytes[cursor++];
|
||||
// Characters written on 4 bytes have 21 bits for a codepoint.
|
||||
// We can't fit that in a single 16-bit code unit, so we use a surrogate pair.
|
||||
var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
|
||||
// Surrogate pair formula from Wikipedia.
|
||||
// 1. Subtract 0x10000 from codepoint
|
||||
codepoint -= 65536;
|
||||
// 2. Split this into the high 10-bit value and the low 10-bit value
|
||||
var low = codepoint & 1023;
|
||||
var high = (codepoint >> 10) & 1023;
|
||||
// 3. Add 0xD800 to the high value to form the high surrogate
|
||||
high += 55296;
|
||||
// 4. Add 0xDC00 to the low value to form the low surrogate:
|
||||
low += 56320;
|
||||
codepoints.push(high);
|
||||
codepoints.push(low);
|
||||
}
|
||||
}
|
||||
|
||||
// String.fromCharCode.apply is faster than manually appending characters on
|
||||
// Chrome 25+, and generates no additional cons string garbage.
|
||||
var result = String.fromCharCode.apply(null, chars);
|
||||
var result = String.fromCodePoint.apply(null, codepoints);
|
||||
this.cursor_ = cursor;
|
||||
return result;
|
||||
};
|
||||
|
|
|
@ -412,16 +412,22 @@ jspb.BinaryEncoder.prototype.writeString = function(value) {
|
|||
|
||||
// UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray.
|
||||
for (var i = 0; i < value.length; i++) {
|
||||
var c = value.charCodeAt(i);
|
||||
var c = value.codePointAt(i);
|
||||
if (c < 128) {
|
||||
this.buffer_.push(c);
|
||||
} else if (c < 2048) {
|
||||
this.buffer_.push((c >> 6) | 192);
|
||||
this.buffer_.push((c & 63) | 128);
|
||||
} else {
|
||||
} else if (c < 65536) {
|
||||
this.buffer_.push((c >> 12) | 224);
|
||||
this.buffer_.push(((c >> 6) & 63) | 128);
|
||||
this.buffer_.push((c & 63) | 128);
|
||||
} else {
|
||||
this.buffer_.push((c >> 18) | 240);
|
||||
this.buffer_.push(((c >> 12) & 63 ) | 128);
|
||||
this.buffer_.push(((c >> 6) & 63) | 128);
|
||||
this.buffer_.push((c & 63) | 128);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue