Add a function for UTF-8 encoding

git-svn-id: https://warzone2100.svn.sourceforge.net/svnroot/warzone2100/trunk@7394 4a71c877-e1ca-e34f-864e-861f7616d084
master
Giel van Schijndel 2009-05-12 15:45:57 +00:00 committed by Git SVN Gateway
parent e5e90e5121
commit aaa1efe4bb
2 changed files with 97 additions and 0 deletions

View File

@ -110,6 +110,90 @@ utf_32_char UTF8DecodeChar(const char *utf8_char, const char **next_char)
return decoded;
}
/**
 * Encode a single character as a NUL-terminated UTF-8 style octet sequence.
 *
 * Values below 0x80 are stored unchanged in a single octet. For everything
 * else the lead octet carries the "counting bits" (one set bit per octet of
 * the sequence, followed by a zero bit) plus the most significant payload
 * bits, and each following octet carries the marker bits 10 plus the next 6
 * payload bits. The longest possible output is a 0xfe lead octet, six
 * continuation octets and the NUL terminator: 8 octets in total.
 *
 * \param[out] utf8_char Buffer of at least \c MAX_UTF8_LEN octets that receives the encoded sequence.
 * \param      c         The character to encode.
 *
 * \return \c utf8_char. This is a null pointer if (and only if) a null buffer
 *         was passed in, in which case nothing is written.
 */
char* UTF8EncodeChar(char utf8_char[MAX_UTF8_LEN], utf_32_char c)
{
	char* curOutPos = utf8_char;
	utf_32_char leadOctet;
	unsigned int continuationCount;

	// Without a buffer there is nowhere to write to. Hand the (null) pointer
	// straight back so the caller sees a NULL result instead of a crash.
	if (!utf8_char)
	{
		return utf8_char;
	}

	// 7 bits: stored as-is, no counting bits
	if (c < 0x00000080)
	{
		continuationCount = 0;
		leadOctet = c;
	}
	// 11 bits: counting bits 110, then the 5 most significant bits
	else if (c < 0x00000800)
	{
		continuationCount = 1;
		leadOctet = 0xc0 | (c >> 6);
	}
	// 16 bits: counting bits 1110, then the 4 most significant bits
	else if (c < 0x00010000)
	{
		continuationCount = 2;
		leadOctet = 0xe0 | (c >> 12);
	}
	// 21 bits: counting bits 11110, then the 3 most significant bits
	else if (c < 0x00200000)
	{
		continuationCount = 3;
		leadOctet = 0xf0 | (c >> 18);
	}
	// 26 bits: counting bits 111110, then the 2 most significant bits
	else if (c < 0x04000000)
	{
		continuationCount = 4;
		leadOctet = 0xf8 | (c >> 24);
	}
	// 31 bits: counting bits 1111110, then the most significant bit
	else if (c < 0x80000000)
	{
		continuationCount = 5;
		leadOctet = 0xfc | (c >> 30);
	}
	// Anything larger: counting bits 11111110 fill the whole lead octet, so
	// all payload bits go into the six continuation octets.
	else
	{
		continuationCount = 6;
		leadOctet = 0xfe;
	}

	*(curOutPos++) = (char)leadOctet;

	// Emit the remaining payload, 6 bits per octet, most significant group
	// first. The shift never exceeds 30 bits, so it stays well-defined for a
	// 32 bit value.
	while (continuationCount > 0)
	{
		--continuationCount;
		*(curOutPos++) = (char)(0x80 | ((c >> (6 * continuationCount)) & 0x3f));
	}

	// NUL terminate the string
	*curOutPos = '\0';

	return utf8_char;
}
size_t UTF8CharacterCount(const char *utf8_string)
{
size_t length = 0;

View File

@ -30,6 +30,11 @@
#include "types.h"
/**
 * The size (in octets) of the buffer needed to hold a single codepoint as
 * encoded by UTF8EncodeChar(): at most 7 octets of encoded data plus the
 * terminating NUL character.
 */
#define MAX_UTF8_LEN 8
/** Used to store a single UTF-32 character.
 */
typedef uint32_t utf_32_char;
@ -61,6 +66,14 @@ utf_32_char UTF16DecodeChar(const utf_16_char *utf16_char, const utf_16_char **n
*/
utf_32_char UTF8DecodeChar(const char *utf8_char, const char **next_char);
/** Encode a single Unicode character as UTF-8.
 *  \param[out] utf8_char Points to a character buffer at least \c MAX_UTF8_LEN octets large. Will be used to store a NUL-terminated, UTF-8 encoded version of \c c.
 *  \param c The Unicode character to encode.
 *
 *  \return \c utf8_char on success, NULL otherwise.
 */
char* UTF8EncodeChar(char utf8_char[MAX_UTF8_LEN], utf_32_char c);
/** Determines the amount of unicode codepoints in a UTF-8 encoded string
* \param utf8_string the UTF-8 encoded string to count
* \return the amount of codepoints found in the UTF-8 string