From eab76c5312504aacba2ff4c82cf44726171a9e32 Mon Sep 17 00:00:00 2001
From: nemtrif
Date: Sun, 6 Aug 2023 07:32:16 -0400
Subject: [PATCH] next16

---
Review notes (ignored by git am):
 - checked next16 now throws invalid_utf16 for INVALID_LEAD and
   INCOMPLETE_SEQUENCE instead of silently returning 0 without advancing.
   This assumes invalid_utf16 is declared earlier in checked.h - confirm.
 - validate_next16 reports a lone trail surrogate as INVALID_LEAD even when
   it is the last word of the range.
 - unchecked next16 masks the trail word the same way as the lead word.
 - Indentation of the context lines was reconstructed; if the patch does not
   apply cleanly, retry with --ignore-whitespace. The index post-image hashes
   predate these review edits.

 source/utf8/checked.h   | 23 +++++++++++++++++++++++
 source/utf8/core.h      | 45 +++++++++++++++++++++++++++++++++++++++++++++
 source/utf8/unchecked.h | 17 +++++++++++++++++
 3 files changed, 85 insertions(+)

diff --git a/source/utf8/checked.h b/source/utf8/checked.h
index 2d159f5..c10f9b3 100644
--- a/source/utf8/checked.h
+++ b/source/utf8/checked.h
@@ -167,6 +167,29 @@ namespace utf8
         return cp;
     }
 
+    // Decodes one code point from the UTF-16 sequence [it, end) and advances
+    // it past the words that were consumed.
+    // Throws not_enough_room if the sequence ends before the code point is
+    // complete, and invalid_utf16 if it starts with an unpaired surrogate.
+    // On failure, validate_next16 has already moved it back to where it was.
+    template <typename word_iterator>
+    utfchar32_t next16(word_iterator& it, word_iterator end)
+    {
+        utfchar32_t cp = 0;
+        const internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp);
+        switch (err_code) {
+            case internal::UTF8_OK :
+                break;
+            case internal::NOT_ENOUGH_ROOM :
+                throw not_enough_room();
+            default :
+                // INVALID_LEAD or INCOMPLETE_SEQUENCE: it still points at the
+                // unpaired surrogate; report it rather than returning a bogus 0.
+                throw invalid_utf16(static_cast<utfchar16_t>(*it));
+        }
+        return cp;
+    }
+
     template <typename octet_iterator>
     utfchar32_t peek_next(octet_iterator it, octet_iterator end)
     {
diff --git a/source/utf8/core.h b/source/utf8/core.h
index 096e529..4494c53 100644
--- a/source/utf8/core.h
+++ b/source/utf8/core.h
@@ -305,6 +305,51 @@ namespace internal
         return utf8::internal::validate_next(it, end, ignored);
     }
 
+    // Decodes one code point from the UTF-16 sequence [it, end) into code_point.
+    // Returns UTF8_OK on success, with it advanced past the consumed word(s).
+    // On failure it is left where it was on entry, code_point is not written,
+    // and the result is:
+    //   NOT_ENOUGH_ROOM     - the range is empty, or ends right after a lead surrogate
+    //   INVALID_LEAD        - the first word is a trail surrogate
+    //   INCOMPLETE_SEQUENCE - a lead surrogate is not followed by a trail surrogate
+    template <typename word_iterator>
+    utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+        // Save the original value of it so we can go back in case of failure
+        // Of course, it does not make much sense with e.g. stream iterators
+        const word_iterator original_it = it;
+
+        const utfchar16_t first_word = *it++;
+        if (!is_surrogate(first_word)) {
+            code_point = first_word;
+            return UTF8_OK;
+        }
+
+        utf_error err = UTF8_OK;
+        if (!is_lead_surrogate(first_word)) {
+            // A trail surrogate cannot start a pair, whatever follows it
+            err = INVALID_LEAD;
+        }
+        else if (it == end) {
+            err = NOT_ENOUGH_ROOM;
+        }
+        else {
+            const utfchar16_t second_word = *it++;
+            if (is_trail_surrogate(second_word)) {
+                // Widen before shifting so the arithmetic is done in utfchar32_t
+                code_point = (static_cast<utfchar32_t>(first_word) << 10) + second_word + SURROGATE_OFFSET;
+                return UTF8_OK;
+            }
+            err = INCOMPLETE_SEQUENCE;
+        }
+
+        // error branch
+        it = original_it;
+        return err;
+    }
+
     // Internal implementation of both checked and unchecked append() function
     // This function will be invoked by the overloads below, as they will know
     // the octet_type.
diff --git a/source/utf8/unchecked.h b/source/utf8/unchecked.h
index 835c429..94f10a5 100644
--- a/source/utf8/unchecked.h
+++ b/source/utf8/unchecked.h
@@ -136,6 +136,23 @@ namespace utf8
             return utf8::unchecked::next(it);
         }
 
+        // Decodes one code point from the UTF-16 sequence starting at it and
+        // advances it past the one or two words consumed.
+        // No validation is done: the input must be valid UTF-16 and a lead
+        // surrogate must be followed by a readable trail surrogate.
+        template <typename word_iterator>
+        utfchar32_t next16(word_iterator& it)
+        {
+            utfchar32_t cp = utf8::internal::mask16(*it++);
+            if (utf8::internal::is_lead_surrogate(cp)) {
+                // Mask the trail word like the lead one, so a signed 16-bit
+                // value type is not sign-extended into the result
+                const utfchar32_t trail = utf8::internal::mask16(*it++);
+                return (cp << 10) + trail + utf8::internal::SURROGATE_OFFSET;
+            }
+            return cp;
+        }
+
         template <typename octet_iterator>
         utfchar32_t prior(octet_iterator& it)
         {