* Split out the code that encodes a single Unicode character as UTF-8 into a separate function: encode_utf8_char
* This will allow encoding of Unicode strings to UTF-8 regardless of their current encoding

git-svn-id: svn+ssh://svn.gna.org/svn/warzone/trunk@5899 4a71c877-e1ca-e34f-864e-861f7616d084
master
parent
ac08ca70b0
commit
2cb4c71883
|
@ -207,6 +207,98 @@ size_t utf32_utf8_buffer_length(const utf_32_char* unicode_string)
|
|||
return length;
|
||||
}
|
||||
|
||||
/** Encodes a single Unicode character to a UTF-8 encoded string.
|
||||
*
|
||||
* \param unicode_char A UTF-32 encoded Unicode codepoint that will be encoded
|
||||
* into UTF-8.
|
||||
* \param out_char Points to the position in a buffer where the UTF-8
|
||||
* encoded character can be stored.
|
||||
*
|
||||
* \return A pointer pointing to the first byte <em>after</em> the encoded
|
||||
* UTF-8 sequence. This can be used as the \c out_char parameter for a
|
||||
* next invocation of encode_utf8_char().
|
||||
*/
|
||||
static char* encode_utf8_char(const utf_32_char unicode_char, char * const out_char)
|
||||
{
|
||||
char * next_char = out_char;
|
||||
|
||||
// 7 bits
|
||||
if (unicode_char < 0x00000080)
|
||||
{
|
||||
*(next_char++) = unicode_char;
|
||||
}
|
||||
// 11 bits
|
||||
else if (unicode_char < 0x00000800)
|
||||
{
|
||||
// 0xc0 provides the counting bits: 110
|
||||
// then append the 5 most significant bits
|
||||
*(next_char++) = 0xc0 | (unicode_char >> 6);
|
||||
// Put the next 6 bits in a byte of their own
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
// 16 bits
|
||||
else if (unicode_char < 0x00010000)
|
||||
{
|
||||
// 0xe0 provides the counting bits: 1110
|
||||
// then append the 4 most significant bits
|
||||
*(next_char++) = 0xe0 | (unicode_char >> 12);
|
||||
// Put the next 12 bits in two bytes of their own
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
// 21 bits
|
||||
else if (unicode_char < 0x00200000)
|
||||
{
|
||||
// 0xf0 provides the counting bits: 11110
|
||||
// then append the 3 most significant bits
|
||||
*(next_char++) = 0xf0 | (unicode_char >> 18);
|
||||
// Put the next 18 bits in three bytes of their own
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
// 26 bits
|
||||
else if (unicode_char < 0x04000000)
|
||||
{
|
||||
// 0xf8 provides the counting bits: 111110
|
||||
// then append the 2 most significant bits
|
||||
*(next_char++) = 0xf8 | (unicode_char >> 24 );
|
||||
// Put the next 24 bits in four bytes of their own
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
// 31 bits
|
||||
else if (unicode_char < 0x80000000)
|
||||
{
|
||||
// 0xfc provides the counting bits: 1111110
|
||||
// then append the 1 most significant bit
|
||||
*(next_char++) = 0xfc | (unicode_char >> 30);
|
||||
// Put the next 30 bits in five bytes of their own
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 24) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
// 36 bits
|
||||
else
|
||||
{
|
||||
// 0xfe provides the counting bits: 11111110
|
||||
*(next_char++) = 0xfe;
|
||||
// Put the next 36 bits in six bytes of their own
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 30) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 24) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
|
||||
*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
|
||||
*(next_char++) = 0x80 | (unicode_char & 0x3f);
|
||||
}
|
||||
|
||||
return next_char;
|
||||
}
|
||||
|
||||
char* utf8_encode(const utf_32_char* unicode_string)
|
||||
{
|
||||
const utf_32_char* curChar;
|
||||
|
@ -225,79 +317,7 @@ char* utf8_encode(const utf_32_char* unicode_string)
|
|||
|
||||
for (curChar = unicode_string; *curChar != 0; ++curChar)
|
||||
{
|
||||
// 7 bits
|
||||
if (*curChar < 0x00000080)
|
||||
{
|
||||
*(curOutPos++) = *curChar;
|
||||
}
|
||||
// 11 bits
|
||||
else if (*curChar < 0x00000800)
|
||||
{
|
||||
// 0xc0 provides the counting bits: 110
|
||||
// then append the 5 most significant bits
|
||||
*(curOutPos++) = 0xc0 | (*curChar >> 6);
|
||||
// Put the next 6 bits in a byte of their own
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
// 16 bits
|
||||
else if (*curChar < 0x00010000)
|
||||
{
|
||||
// 0xe0 provides the counting bits: 1110
|
||||
// then append the 4 most significant bits
|
||||
*(curOutPos++) = 0xe0 | (*curChar >> 12);
|
||||
// Put the next 12 bits in two bytes of their own
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
// 21 bits
|
||||
else if (*curChar < 0x00200000)
|
||||
{
|
||||
// 0xf0 provides the counting bits: 11110
|
||||
// then append the 3 most significant bits
|
||||
*(curOutPos++) = 0xf0 | (*curChar >> 18);
|
||||
// Put the next 18 bits in three bytes of their own
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
// 26 bits
|
||||
else if (*curChar < 0x04000000)
|
||||
{
|
||||
// 0xf8 provides the counting bits: 111110
|
||||
// then append the 2 most significant bits
|
||||
*(curOutPos++) = 0xf8 | (*curChar >> 24 );
|
||||
// Put the next 24 bits in four bytes of their own
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
// 31 bits
|
||||
else if (*curChar < 0x80000000)
|
||||
{
|
||||
// 0xfc provides the counting bits: 1111110
|
||||
// then append the 1 most significant bit
|
||||
*(curOutPos++) = 0xfc | (*curChar >> 30);
|
||||
// Put the next 30 bits in five bytes of their own
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
// 36 bits
|
||||
else
|
||||
{
|
||||
// 0xfe provides the counting bits: 11111110
|
||||
*(curOutPos++) = 0xfe;
|
||||
// Put the next 36 bits in six bytes of their own
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 30) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
|
||||
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
|
||||
}
|
||||
curOutPos = encode_utf8_char(*curChar, curOutPos);
|
||||
}
|
||||
|
||||
// Terminate the string with a nul character
|
||||
|
|
Loading…
Reference in New Issue