osb/source/core/StarUnicode.hpp

234 lines
6.5 KiB
C++
Raw Normal View History

2023-06-20 04:33:09 +00:00
#ifndef STAR_UTF8_HPP
#define STAR_UTF8_HPP
#include "StarByteArray.hpp"
#include "StarMaybe.hpp"
namespace Star {
STAR_EXCEPTION(UnicodeException, StarException);
typedef char Utf8Type;
typedef char32_t Utf32Type;
#define STAR_UTF32_REPLACEMENT_CHAR 0x000000b7L
void throwInvalidUtf8Sequence();
void throwMissingUtf8End();
void throwInvalidUtf32CodePoint(Utf32Type val);
// If passed NPos as a size, assumes modified UTF-8 and stops on NULL byte.
// Otherwise, ignores NULL.
size_t utf8Length(Utf8Type const* utf8, size_t size = NPos);
// Encode up to six utf8 bytes into a utf32 character. If passed NPos as len,
// assumes modified UTF-8 and stops on NULL, otherwise ignores.
size_t utf8DecodeChar(Utf8Type const* utf8, Utf32Type* utf32, size_t len = NPos);
// Encode single utf32 char into up to 6 utf8 characters.
size_t utf8EncodeChar(Utf8Type* utf8, Utf32Type utf32, size_t len = 6);
Utf32Type hexStringToUtf32(std::string const& codepoint, Maybe<Utf32Type> previousCodepoint = {});
std::string hexStringFromUtf32(Utf32Type character);
bool isUtf16LeadSurrogate(Utf32Type codepoint);
bool isUtf16TrailSurrogate(Utf32Type codepoint);
Utf32Type utf32FromUtf16SurrogatePair(Utf32Type lead, Utf32Type trail);
pair<Utf32Type, Maybe<Utf32Type>> utf32ToUtf16SurrogatePair(Utf32Type codepoint);
// Bidirectional iterator that can make utf8 appear as utf32
template <class BaseIterator, class U32Type = Utf32Type>
class U8ToU32Iterator {
public:
typedef ptrdiff_t difference_type;
typedef U32Type value_type;
typedef U32Type* pointer;
typedef U32Type& reference;
typedef std::bidirectional_iterator_tag iterator_category;
U8ToU32Iterator() : m_position(), m_value(pending_read) {}
U8ToU32Iterator(BaseIterator b) : m_position(b), m_value(pending_read) {}
2023-06-28 10:07:22 +00:00
BaseIterator const& base() const {
return m_position;
}
2023-06-20 04:33:09 +00:00
U32Type const& operator*() const {
if (m_value == pending_read)
extract_current();
return m_value;
}
U8ToU32Iterator const& operator++() {
increment();
return *this;
}
U8ToU32Iterator operator++(int) {
U8ToU32Iterator clone(*this);
increment();
return clone;
}
U8ToU32Iterator const& operator--() {
decrement();
return *this;
}
U8ToU32Iterator operator--(int) {
U8ToU32Iterator clone(*this);
decrement();
return clone;
}
bool operator==(U8ToU32Iterator const& that) const {
return equal(that);
}
bool operator!=(U8ToU32Iterator const& that) const {
return !equal(that);
}
private:
// special values for pending iterator reads:
static U32Type const pending_read = 0xffffffffu;
static void invalid_sequence() {
throwInvalidUtf8Sequence();
}
static unsigned utf8_byte_count(Utf8Type c) {
// if the most significant bit with a zero in it is in position
// 8-N then there are N bytes in this UTF-8 sequence:
uint8_t mask = 0x80u;
unsigned result = 0;
while (c & mask) {
++result;
mask >>= 1;
}
return (result == 0) ? 1 : ((result > 4) ? 4 : result);
}
static unsigned utf8_trailing_byte_count(Utf8Type c) {
return utf8_byte_count(c) - 1;
}
void increment() {
// skip high surrogate first if there is one:
unsigned c = utf8_byte_count(*m_position);
std::advance(m_position, c);
m_value = pending_read;
}
void decrement() {
// Keep backtracking until we don't have a trailing character:
unsigned count = 0;
while (((uint8_t) * --m_position & 0xC0u) == 0x80u)
++count;
// now check that the sequence was valid:
if (count != utf8_trailing_byte_count(*m_position))
invalid_sequence();
m_value = pending_read;
}
bool equal(const U8ToU32Iterator& that) const {
return m_position == that.m_position;
}
void extract_current() const {
m_value = static_cast<Utf8Type>(*m_position);
// we must not have a continuation character:
if (((uint8_t)m_value & 0xC0u) == 0x80u)
invalid_sequence();
// see how many extra byts we have:
unsigned extra = utf8_trailing_byte_count(*m_position);
// extract the extra bits, 6 from each extra byte:
BaseIterator next(m_position);
for (unsigned c = 0; c < extra; ++c) {
++next;
m_value <<= 6;
auto entry = static_cast<uint8_t>(*next);
if ((c > 0) && ((entry & 0xC0u) != 0x80u))
invalid_sequence();
m_value += entry & 0x3Fu;
}
// we now need to remove a few of the leftmost bits, but how many depends
// upon how many extra bytes we've extracted:
static const Utf32Type masks[4] = {
0x7Fu, 0x7FFu, 0xFFFFu, 0x1FFFFFu,
};
m_value &= masks[extra];
// check the result:
if ((uint32_t)m_value > (uint32_t)0x10FFFFu)
invalid_sequence();
}
BaseIterator m_position;
mutable U32Type m_value;
};
// Output iterator
template <class BaseIterator, class U32Type = Utf32Type>
class Utf8OutputIterator {
public:
typedef void difference_type;
typedef void value_type;
typedef U32Type* pointer;
typedef U32Type& reference;
Utf8OutputIterator(const BaseIterator& b) : m_position(b) {}
Utf8OutputIterator(const Utf8OutputIterator& that) : m_position(that.m_position) {}
Utf8OutputIterator& operator=(const Utf8OutputIterator& that) {
m_position = that.m_position;
return *this;
}
const Utf8OutputIterator& operator*() const {
return *this;
}
void operator=(U32Type val) const {
push(val);
}
Utf8OutputIterator& operator++() {
return *this;
}
Utf8OutputIterator& operator++(int) {
return *this;
}
private:
static void invalid_utf32_code_point(U32Type val) {
throwInvalidUtf32CodePoint(val);
}
void push(U32Type c) const {
if (c > 0x10FFFFu)
invalid_utf32_code_point(c);
if ((uint32_t)c < 0x80u) {
*m_position++ = static_cast<Utf8Type>((uint32_t)c);
} else if ((uint32_t)c < 0x800u) {
*m_position++ = static_cast<Utf8Type>(0xC0u + ((uint32_t)c >> 6));
*m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
} else if ((uint32_t)c < 0x10000u) {
*m_position++ = static_cast<Utf8Type>(0xE0u + ((uint32_t)c >> 12));
*m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu));
*m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
} else {
*m_position++ = static_cast<Utf8Type>(0xF0u + ((uint32_t)c >> 18));
*m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 12) & 0x3Fu));
*m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu));
*m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
}
}
mutable BaseIterator m_position;
};
}
#endif