Add a function for UTF-8 encoding

git-svn-id: https://warzone2100.svn.sourceforge.net/svnroot/warzone2100/trunk@7394 4a71c877-e1ca-e34f-864e-861f7616d084
2009-05-12 15:45:57 +00:00 · 2009-05-12 15:45:57 +00:00 · aaa1efe4bb
parent e5e90e5121
commit aaa1efe4bb
2 changed files with 97 additions and 0 deletions
--- a/lib/framework/utf.c
+++ b/lib/framework/utf.c
@@ -110,6 +110,90 @@ utf_32_char UTF8DecodeChar(const char *utf8_char, const char **next_char)
 	return decoded;
 }

+char* UTF8EncodeChar(char utf8_char[MAX_UTF8_LEN], utf_32_char c)
+{
+	char* curOutPos = utf8_char; // position of the next octet to write
+
+	// Encode c in the shortest form that can hold it. 7 bits: one octet holding c unchanged
+	if      (c < 0x00000080)
+	{
+		*(curOutPos++) = c;
+	}
+	// 11 bits: two octets
+	else if (c < 0x00000800)
+	{
+		// 0xc0 provides the counting bits: 110
+		// then append the 5 most significant bits
+		*(curOutPos++) = 0xc0 | (c >> 6);
+		// Put the remaining 6 bits in a byte of their own (0x80 marks it: leading bits 10)
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+	// 16 bits: three octets
+	else if (c < 0x00010000)
+	{
+		// 0xe0 provides the counting bits: 1110
+		// then append the 4 most significant bits
+		*(curOutPos++) = 0xe0 | (c >> 12);
+		// Put the next 12 bits in two bytes of their own (6 bits each)
+		*(curOutPos++) = 0x80 | ((c >> 6) & 0x3f);
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+	// 21 bits: four octets
+	else if (c < 0x00200000)
+	{
+		// 0xf0 provides the counting bits: 11110
+		// then append the 3 most significant bits
+		*(curOutPos++) = 0xf0 | (c >> 18);
+		// Put the next 18 bits in three bytes of their own (6 bits each)
+		*(curOutPos++) = 0x80 | ((c >> 12) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 6) & 0x3f);
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+	// 26 bits: five octets
+	else if (c < 0x04000000)
+	{
+		// 0xf8 provides the counting bits: 111110
+		// then append the 2 most significant bits
+		*(curOutPos++) = 0xf8 | (c >> 24 );
+		// Put the next 24 bits in four bytes of their own (6 bits each)
+		*(curOutPos++) = 0x80 | ((c >> 18) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 12) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 6) & 0x3f);
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+	// 31 bits: six octets
+	else if (c < 0x80000000)
+	{
+		// 0xfc provides the counting bits: 1111110
+		// then append the 1 most significant bit
+		*(curOutPos++) = 0xfc | (c >> 30);
+		// Put the next 30 bits in five bytes of their own (6 bits each)
+		*(curOutPos++) = 0x80 | ((c >> 24) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 18) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 12) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 6) & 0x3f);
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+	// 36 bits of room: seven octets, used for every remaining value (c >= 0x80000000)
+	else
+	{
+		// 0xfe provides the counting bits: 11111110 (this lead byte carries no bits of c)
+		*(curOutPos++) = 0xfe;
+		// Put c in six bytes of their own (6 bits each, 36 bits of room)
+		*(curOutPos++) = 0x80 | ((c >> 30) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 24) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 18) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 12) & 0x3f);
+		*(curOutPos++) = 0x80 | ((c >> 6) & 0x3f);
+		*(curOutPos++) = 0x80 | (c & 0x3f);
+	}
+
+	// NUL terminate the string (the longest form above writes 7 octets, so at most 8 in total)
+	*curOutPos = '\0';
+
+	return utf8_char;
+}
+
 size_t UTF8CharacterCount(const char *utf8_string)
 {
 	size_t length = 0;
--- a/lib/framework/utf.h
+++ b/lib/framework/utf.h
@@ -30,6 +30,11 @@

 #include "types.h"

+/**
+ * The maximum size (in octets) a single UTF-8 encoded codepoint can use, as written by UTF8EncodeChar(): up to 7 encoded octets plus the terminating NUL.
+ */
+#define MAX_UTF8_LEN 8
+
 /** Used to store a UTF-32 character in
 */
 typedef uint32_t utf_32_char;
@@ -61,6 +66,14 @@ utf_32_char UTF16DecodeChar(const utf_16_char *utf16_char, const utf_16_char **n
 */
 utf_32_char UTF8DecodeChar(const char *utf8_char, const char **next_char);

+/** Encode a single Unicode character as UTF-8.
+ *  \param[out] utf8_char Points to a character buffer at least \c MAX_UTF8_LEN octets large. Will be used to store a NUL-terminated, UTF-8 encoded version of \c c.
+ *  \param      c         The Unicode character to encode.
+ *
+ *  \return \c utf8_char. (The implementation in utf.c has no failure path, so NULL is never returned.)
+ */
+char* UTF8EncodeChar(char utf8_char[MAX_UTF8_LEN], utf_32_char c);
+
 /** Determines the amount of unicode codepoints in a UTF-8 encoded string
 *  \param utf8_string the UTF-8 encoded string to count
 *  \return the amount of codepoints found in the UTF-8 string