* Split out the code that encodes a single Unicode character as UTF-8 into a separate function: encode_utf8_char

* This will allow encoding of Unicode strings to UTF-8 regardless of their current encoding

git-svn-id: svn+ssh://svn.gna.org/svn/warzone/trunk@5899 4a71c877-e1ca-e34f-864e-861f7616d084
master
Giel van Schijndel 2008-08-30 13:47:15 +00:00
parent ac08ca70b0
commit 2cb4c71883
1 changed files with 93 additions and 73 deletions

View File

@ -207,6 +207,98 @@ size_t utf32_utf8_buffer_length(const utf_32_char* unicode_string)
return length;
}
/** Writes the UTF-8 byte sequence for one Unicode codepoint into a buffer.
 *
 *  The sequence length is chosen from the magnitude of the codepoint:
 *  1 byte below 0x80, 2 below 0x800, 3 below 0x10000, 4 below 0x200000,
 *  5 below 0x04000000, 6 below 0x80000000 and 7 bytes for anything larger.
 *  No terminating NUL is written and no bounds checking is performed, so up
 *  to seven bytes starting at \c out_char are overwritten.
 *
 *  \param unicode_char The UTF-32 encoded codepoint to convert.
 *  \param out_char     Position in the output buffer where the first byte of
 *                      the UTF-8 sequence is stored.
 *
 *  \return A pointer to the byte directly <em>behind</em> the sequence that
 *          was just written; pass it as \c out_char to the next call of
 *          encode_utf8_char() to append another character.
 */
static char* encode_utf8_char(const utf_32_char unicode_char, char * const out_char)
{
	char *write_pos = out_char;
	int lead_marker;         // "counting" bits that open the sequence
	unsigned int tail_bytes; // number of 10xxxxxx bytes after the lead byte

	// Classify the codepoint by the number of significant bits it needs
	if (unicode_char < 0x00000080)
	{
		// 7 bits: plain ASCII, no marker bits at all
		lead_marker = 0x00;
		tail_bytes = 0;
	}
	else if (unicode_char < 0x00000800)
	{
		// 11 bits: 110xxxxx
		lead_marker = 0xc0;
		tail_bytes = 1;
	}
	else if (unicode_char < 0x00010000)
	{
		// 16 bits: 1110xxxx
		lead_marker = 0xe0;
		tail_bytes = 2;
	}
	else if (unicode_char < 0x00200000)
	{
		// 21 bits: 11110xxx
		lead_marker = 0xf0;
		tail_bytes = 3;
	}
	else if (unicode_char < 0x04000000)
	{
		// 26 bits: 111110xx
		lead_marker = 0xf8;
		tail_bytes = 4;
	}
	else if (unicode_char < 0x80000000)
	{
		// 31 bits: 1111110x
		lead_marker = 0xfc;
		tail_bytes = 5;
	}
	else
	{
		// Anything larger: 11111110, all payload lives in the tail bytes
		lead_marker = 0xfe;
		tail_bytes = 6;
	}

	if (tail_bytes == 6)
	{
		// The 7-byte form has no room for payload in its lead byte. It is
		// handled on its own so the codepoint is never shifted by 36 bits.
		*(write_pos++) = lead_marker;
	}
	else
	{
		// Lead byte: marker bits followed by the most significant payload bits
		*(write_pos++) = lead_marker | (unicode_char >> (6 * tail_bytes));
	}

	// Tail bytes: 6 payload bits each, most significant group first
	while (tail_bytes-- > 0)
	{
		*(write_pos++) = 0x80 | ((unicode_char >> (6 * tail_bytes)) & 0x3f);
	}

	return write_pos;
}
char* utf8_encode(const utf_32_char* unicode_string)
{
const utf_32_char* curChar;
@ -225,79 +317,7 @@ char* utf8_encode(const utf_32_char* unicode_string)
for (curChar = unicode_string; *curChar != 0; ++curChar)
{
// 7 bits
if (*curChar < 0x00000080)
{
*(curOutPos++) = *curChar;
}
// 11 bits
else if (*curChar < 0x00000800)
{
// 0xc0 provides the counting bits: 110
// then append the 5 most significant bits
*(curOutPos++) = 0xc0 | (*curChar >> 6);
// Put the next 6 bits in a byte of their own
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
// 16 bits
else if (*curChar < 0x00010000)
{
// 0xe0 provides the counting bits: 1110
// then append the 4 most significant bits
*(curOutPos++) = 0xe0 | (*curChar >> 12);
// Put the next 12 bits in two bytes of their own
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
// 21 bits
else if (*curChar < 0x00200000)
{
// 0xf0 provides the counting bits: 11110
// then append the 3 most significant bits
*(curOutPos++) = 0xf0 | (*curChar >> 18);
// Put the next 18 bits in three bytes of their own
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
// 26 bits
else if (*curChar < 0x04000000)
{
// 0xf8 provides the counting bits: 111110
// then append the 2 most significant bits
*(curOutPos++) = 0xf8 | (*curChar >> 24 );
// Put the next 24 bits in four bytes of their own
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
// 31 bits
else if (*curChar < 0x80000000)
{
// 0xfc provides the counting bits: 1111110
// then append the 1 most significant bit
*(curOutPos++) = 0xfc | (*curChar >> 30);
// Put the next 30 bits in five bytes of their own
*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
// 36 bits
else
{
// 0xfe provides the counting bits: 11111110
*(curOutPos++) = 0xfe;
// Put the next 36 bits in six bytes of their own
*(curOutPos++) = 0x80 | ((*curChar >> 30) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
*(curOutPos++) = 0x80 | (*curChar & 0x3f);
}
curOutPos = encode_utf8_char(*curChar, curOutPos);
}
// Terminate the string with a nul character