//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
|
||
|
#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
|
||
|
#define BOOST_LOCALE_UTF_HPP_INCLUDED
|
||
|
|
||
|
#include <boost/cstdint.hpp>
|
||
|
|
||
|
namespace boost {
|
||
|
namespace locale {
|
||
|
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// All functions defined in this namespace do not require linking with Boost.Locale library
///
|
||
|
namespace utf {
|
||
|
/// \cond INTERNAL
// Branch-prediction hints for the encoders/decoders in this header.
// With a GCC-compatible compiler they forward the condition to
// __builtin_expect together with the expected outcome (1 = usually true,
// 0 = usually false); with any other compiler they expand to the bare
// condition, so they are purely an optimisation hint.
#ifdef __GNUC__
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
/// \endcond
|
||
|
|
||
|
///
/// \brief The integral type that can hold a Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Special constant that defines illegal code point
///
/// The value is above 0x10FFFF, so it never equals a real code point.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that defines incomplete code point
///
/// The value is above 0x10FFFF, so it never equals a real code point,
/// and it differs from \ref illegal.
///
static const code_point incomplete = 0xFFFFFFFEu;
|
||
|
|
||
|
///
|
||
|
/// \brief the function checks if \a v is a valid code point
|
||
|
///
|
||
|
inline bool is_valid_codepoint(code_point v)
|
||
|
{
|
||
|
if(v>0x10FFFF)
|
||
|
return false;
|
||
|
if(0xD800 <=v && v<= 0xDFFF) // surragates
|
||
|
return false;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
#ifdef BOOST_LOCALE_DOXYGEN
|
||
|
///
|
||
|
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
|
||
|
///
|
||
|
template<typename CharType,int size=sizeof(CharType)>
|
||
|
struct utf_traits {
|
||
|
///
|
||
|
/// The type of the character
|
||
|
///
|
||
|
typedef CharType char_type;
|
||
|
///
|
||
|
/// Read one code point from the range [p,e) and return it.
|
||
|
///
|
||
|
/// - If the sequence that was read is incomplete sequence returns \ref incomplete,
|
||
|
/// - If illegal sequence detected returns \ref illegal
|
||
|
///
|
||
|
/// Requirements
|
||
|
///
|
||
|
/// - Iterator is valid input iterator
|
||
|
///
|
||
|
/// Postconditions
|
||
|
///
|
||
|
/// - p points to the last consumed character
|
||
|
///
|
||
|
template<typename Iterator>
|
||
|
static code_point decode(Iterator &p,Iterator e);
|
||
|
|
||
|
///
|
||
|
/// Maximal width of valid sequence in the code units:
|
||
|
///
|
||
|
/// - UTF-8 - 4
|
||
|
/// - UTF-16 - 2
|
||
|
/// - UTF-32 - 1
|
||
|
///
|
||
|
static const int max_width;
|
||
|
///
|
||
|
/// The width of specific code point in the code units.
|
||
|
///
|
||
|
/// Requirement: value is a valid Unicode code point
|
||
|
/// Returns value in range [1..max_width]
|
||
|
///
|
||
|
static int width(code_point value);
|
||
|
|
||
|
///
|
||
|
/// Get the size of the trail part of variable length encoded sequence.
|
||
|
///
|
||
|
/// Returns -1 if C is not valid lead character
|
||
|
///
|
||
|
static int trail_length(char_type c);
|
||
|
///
|
||
|
/// Returns true if c is trail code unit, always false for UTF-32
|
||
|
///
|
||
|
static bool is_trail(char_type c);
|
||
|
///
|
||
|
/// Returns true if c is lead code unit, always true of UTF-32
|
||
|
///
|
||
|
static bool is_lead(char_type c);
|
||
|
|
||
|
///
|
||
|
/// Convert valid Unicode code point \a value to the UTF sequence.
|
||
|
///
|
||
|
/// Requirements:
|
||
|
///
|
||
|
/// - \a value is valid code point
|
||
|
/// - \a out is an output iterator should be able to accept at least width(value) units
|
||
|
///
|
||
|
/// Returns the iterator past the last written code unit.
|
||
|
///
|
||
|
template<typename Iterator>
|
||
|
static Iterator encode(code_point value,Iterator out);
|
||
|
///
|
||
|
/// Decodes valid UTF sequence that is pointed by p into code point.
|
||
|
///
|
||
|
/// If the sequence is invalid or points to end the behavior is undefined
|
||
|
///
|
||
|
template<typename Iterator>
|
||
|
static code_point decode_valid(Iterator &p);
|
||
|
};
|
||
|
|
||
|
#else
|
||
|
|
||
|
// Primary template: declared but never defined. The second parameter
// defaults to sizeof(CharType), so utf_traits<CharType> names the
// specialization that matches the width of the character type.
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
|
||
|
|
||
|
template<typename CharType>
|
||
|
struct utf_traits<CharType,1> {
|
||
|
|
||
|
typedef CharType char_type;
|
||
|
|
||
|
static int trail_length(char_type ci)
|
||
|
{
|
||
|
unsigned char c = ci;
|
||
|
if(c < 128)
|
||
|
return 0;
|
||
|
if(BOOST_LOCALE_UNLIKELY(c < 194))
|
||
|
return -1;
|
||
|
if(c < 224)
|
||
|
return 1;
|
||
|
if(c < 240)
|
||
|
return 2;
|
||
|
if(BOOST_LOCALE_LIKELY(c <=244))
|
||
|
return 3;
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
static const int max_width = 4;
|
||
|
|
||
|
static int width(code_point value)
|
||
|
{
|
||
|
if(value <=0x7F) {
|
||
|
return 1;
|
||
|
}
|
||
|
else if(value <=0x7FF) {
|
||
|
return 2;
|
||
|
}
|
||
|
else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
|
||
|
return 3;
|
||
|
}
|
||
|
else {
|
||
|
return 4;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static bool is_trail(char_type ci)
|
||
|
{
|
||
|
unsigned char c=ci;
|
||
|
return (c & 0xC0)==0x80;
|
||
|
}
|
||
|
|
||
|
static bool is_lead(char_type ci)
|
||
|
{
|
||
|
return !is_trail(ci);
|
||
|
}
|
||
|
|
||
|
template<typename Iterator>
|
||
|
static code_point decode(Iterator &p,Iterator e)
|
||
|
{
|
||
|
if(BOOST_LOCALE_UNLIKELY(p==e))
|
||
|
return incomplete;
|
||
|
|
||
|
unsigned char lead = *p++;
|
||
|
|
||
|
// First byte is fully validated here
|
||
|
int trail_size = trail_length(lead);
|
||
|
|
||
|
if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
|
||
|
return illegal;
|
||
|
|
||
|
//
|
||
|
// Ok as only ASCII may be of size = 0
|
||
|
// also optimize for ASCII text
|
||
|
//
|
||
|
if(trail_size == 0)
|
||
|
return lead;
|
||
|
|
||
|
code_point c = lead & ((1<<(6-trail_size))-1);
|
||
|
|
||
|
// Read the rest
|
||
|
unsigned char tmp;
|
||
|
switch(trail_size) {
|
||
|
case 3:
|
||
|
if(BOOST_LOCALE_UNLIKELY(p==e))
|
||
|
return incomplete;
|
||
|
tmp = *p++;
|
||
|
if (!is_trail(tmp))
|
||
|
return illegal;
|
||
|
c = (c << 6) | ( tmp & 0x3F);
|
||
|
case 2:
|
||
|
if(BOOST_LOCALE_UNLIKELY(p==e))
|
||
|
return incomplete;
|
||
|
tmp = *p++;
|
||
|
if (!is_trail(tmp))
|
||
|
return illegal;
|
||
|
c = (c << 6) | ( tmp & 0x3F);
|
||
|
case 1:
|
||
|
if(BOOST_LOCALE_UNLIKELY(p==e))
|
||
|
return incomplete;
|
||
|
tmp = *p++;
|
||
|
if (!is_trail(tmp))
|
||
|
return illegal;
|
||
|
c = (c << 6) | ( tmp & 0x3F);
|
||
|
}
|
||
|
|
||
|
// Check code point validity: no surrogates and
|
||
|
// valid range
|
||
|
if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
|
||
|
return illegal;
|
||
|
|
||
|
// make sure it is the most compact representation
|
||
|
if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
|
||
|
return illegal;
|
||
|
|
||
|
return c;
|
||
|
|
||
|
}
|
||
|
|
||
|
template<typename Iterator>
|
||
|
static code_point decode_valid(Iterator &p)
|
||
|
{
|
||
|
unsigned char lead = *p++;
|
||
|
if(lead < 192)
|
||
|
return lead;
|
||
|
|
||
|
int trail_size;
|
||
|
|
||
|
if(lead < 224)
|
||
|
trail_size = 1;
|
||
|
else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
|
||
|
trail_size = 2;
|
||
|
else
|
||
|
trail_size = 3;
|
||
|
|
||
|
code_point c = lead & ((1<<(6-trail_size))-1);
|
||
|
|
||
|
switch(trail_size) {
|
||
|
case 3:
|
||
|
c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
||
|
case 2:
|
||
|
c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
||
|
case 1:
|
||
|
c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
||
|
}
|
||
|
|
||
|
return c;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
template<typename Iterator>
|
||
|
static Iterator encode(code_point value,Iterator out)
|
||
|
{
|
||
|
if(value <= 0x7F) {
|
||
|
*out++ = static_cast<char_type>(value);
|
||
|
}
|
||
|
else if(value <= 0x7FF) {
|
||
|
*out++ = static_cast<char_type>((value >> 6) | 0xC0);
|
||
|
*out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
||
|
}
|
||
|
else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
|
||
|
*out++ = static_cast<char_type>((value >> 12) | 0xE0);
|
||
|
*out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
|
||
|
*out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
||
|
}
|
||
|
else {
|
||
|
*out++ = static_cast<char_type>((value >> 18) | 0xF0);
|
||
|
*out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
|
||
|
*out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
|
||
|
*out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
||
|
}
|
||
|
return out;
|
||
|
}
|
||
|
}; // utf8
|
||
|
|
||
|
template<typename CharType>
|
||
|
struct utf_traits<CharType,2> {
|
||
|
typedef CharType char_type;
|
||
|
|
||
|
// See RFC 2781
|
||
|
static bool is_first_surrogate(uint16_t x)
|
||
|
{
|
||
|
return 0xD800 <=x && x<= 0xDBFF;
|
||
|
}
|
||
|
static bool is_second_surrogate(uint16_t x)
|
||
|
{
|
||
|
return 0xDC00 <=x && x<= 0xDFFF;
|
||
|
}
|
||
|
static code_point combine_surrogate(uint16_t w1,uint16_t w2)
|
||
|
{
|
||
|
return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
|
||
|
}
|
||
|
static int trail_length(char_type c)
|
||
|
{
|
||
|
if(is_first_surrogate(c))
|
||
|
return 1;
|
||
|
if(is_second_surrogate(c))
|
||
|
return -1;
|
||
|
return 0;
|
||
|
}
|
||
|
///
|
||
|
/// Returns true if c is trail code unit, always false for UTF-32
|
||
|
///
|
||
|
static bool is_trail(char_type c)
|
||
|
{
|
||
|
return is_second_surrogate(c);
|
||
|
}
|
||
|
///
|
||
|
/// Returns true if c is lead code unit, always true of UTF-32
|
||
|
///
|
||
|
static bool is_lead(char_type c)
|
||
|
{
|
||
|
return !is_second_surrogate(c);
|
||
|
}
|
||
|
|
||
|
template<typename It>
|
||
|
static code_point decode(It ¤t,It last)
|
||
|
{
|
||
|
if(BOOST_LOCALE_UNLIKELY(current == last))
|
||
|
return incomplete;
|
||
|
uint16_t w1=*current++;
|
||
|
if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
|
||
|
return w1;
|
||
|
}
|
||
|
if(w1 > 0xDBFF)
|
||
|
return illegal;
|
||
|
if(current==last)
|
||
|
return incomplete;
|
||
|
uint16_t w2=*current++;
|
||
|
if(w2 < 0xDC00 || 0xDFFF < w2)
|
||
|
return illegal;
|
||
|
return combine_surrogate(w1,w2);
|
||
|
}
|
||
|
template<typename It>
|
||
|
static code_point decode_valid(It ¤t)
|
||
|
{
|
||
|
uint16_t w1=*current++;
|
||
|
if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
|
||
|
return w1;
|
||
|
}
|
||
|
uint16_t w2=*current++;
|
||
|
return combine_surrogate(w1,w2);
|
||
|
}
|
||
|
|
||
|
static const int max_width = 2;
|
||
|
static int width(code_point u)
|
||
|
{
|
||
|
return u>=0x10000 ? 2 : 1;
|
||
|
}
|
||
|
template<typename It>
|
||
|
static It encode(code_point u,It out)
|
||
|
{
|
||
|
if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
|
||
|
*out++ = static_cast<char_type>(u);
|
||
|
}
|
||
|
else {
|
||
|
u -= 0x10000;
|
||
|
*out++ = static_cast<char_type>(0xD800 | (u>>10));
|
||
|
*out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
|
||
|
}
|
||
|
return out;
|
||
|
}
|
||
|
}; // utf16;
|
||
|
|
||
|
|
||
|
template<typename CharType>
|
||
|
struct utf_traits<CharType,4> {
|
||
|
typedef CharType char_type;
|
||
|
static int trail_length(char_type c)
|
||
|
{
|
||
|
if(is_valid_codepoint(c))
|
||
|
return 0;
|
||
|
return -1;
|
||
|
}
|
||
|
static bool is_trail(char_type /*c*/)
|
||
|
{
|
||
|
return false;
|
||
|
}
|
||
|
static bool is_lead(char_type /*c*/)
|
||
|
{
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
template<typename It>
|
||
|
static code_point decode_valid(It ¤t)
|
||
|
{
|
||
|
return *current++;
|
||
|
}
|
||
|
|
||
|
template<typename It>
|
||
|
static code_point decode(It ¤t,It last)
|
||
|
{
|
||
|
if(BOOST_LOCALE_UNLIKELY(current == last))
|
||
|
return boost::locale::utf::incomplete;
|
||
|
code_point c=*current++;
|
||
|
if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
|
||
|
return boost::locale::utf::illegal;
|
||
|
return c;
|
||
|
}
|
||
|
static const int max_width = 1;
|
||
|
static int width(code_point /*u*/)
|
||
|
{
|
||
|
return 1;
|
||
|
}
|
||
|
template<typename It>
|
||
|
static It encode(code_point u,It out)
|
||
|
{
|
||
|
*out++ = static_cast<char_type>(u);
|
||
|
return out;
|
||
|
}
|
||
|
|
||
|
}; // utf32
|
||
|
|
||
|
#endif
|
||
|
|
||
|
|
||
|
} // utf
|
||
|
} // locale
|
||
|
} // boost
|
||
|
|
||
|
|
||
|
#endif
|
||
|
|
||
|
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
|
||
|
|