* Split out the code that encodes a single Unicode character as UTF-8 into a separate function: encode_utf8_char

* This will allow encoding of Unicode strings to UTF-8 regardless of their current encoding

git-svn-id: svn+ssh://svn.gna.org/svn/warzone/trunk@5899 4a71c877-e1ca-e34f-864e-861f7616d084
2008-08-30 13:47:15 +00:00 · 2008-08-30 13:47:15 +00:00 · 2cb4c71883
parent ac08ca70b0
commit 2cb4c71883
1 changed files with 93 additions and 73 deletions
--- a/lib/framework/utf8.c
+++ b/lib/framework/utf8.c
@@ -207,6 +207,98 @@ size_t utf32_utf8_buffer_length(const utf_32_char* unicode_string)
 	return length;
 }

+/** Encodes a single Unicode code point as a UTF-8 byte sequence.
+ *
+ *  \param unicode_char The Unicode code point to encode, given as a UTF-32
+ *                      value.
+ *  \param out_char     Start of the output: 1 to 7 bytes are written here,
+ *                      depending on the value; no NUL terminator is added.
+ *
+ *  \return A pointer pointing to the first byte <em>after</em> the encoded
+ *          UTF-8 sequence. This can be used as the \c out_char parameter for a
+ *          next invocation of encode_utf8_char().
+ */
+static char* encode_utf8_char(const utf_32_char unicode_char, char * const out_char)
+{
+	char * next_char = out_char;
+
+	// 7 bits: the value is stored unchanged in a single byte
+	if      (unicode_char < 0x00000080)
+	{
+		*(next_char++) = unicode_char;
+	}
+	// 11 bits: two bytes
+	else if (unicode_char < 0x00000800)
+	{
+		// 0xc0 provides the lead byte's counting bits: 110
+		// followed by the 5 most significant bits of the value
+		*(next_char++) = 0xc0 | (unicode_char >> 6);
+		// The remaining 6 bits go in a continuation byte (marker bits 10)
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+	// 16 bits: three bytes
+	else if (unicode_char < 0x00010000)
+	{
+		// 0xe0 provides the lead byte's counting bits: 1110
+		// followed by the 4 most significant bits of the value
+		*(next_char++) = 0xe0 | (unicode_char >> 12);
+		// The remaining 12 bits go in two continuation bytes, 6 bits each
+		*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+	// 21 bits: four bytes
+	else if (unicode_char < 0x00200000)
+	{
+		// 0xf0 provides the lead byte's counting bits: 11110
+		// followed by the 3 most significant bits of the value
+		*(next_char++) = 0xf0 | (unicode_char >> 18);
+		// The remaining 18 bits go in three continuation bytes, 6 bits each
+		*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+	// 26 bits: five bytes (NOTE: RFC 3629 UTF-8 stops at four-byte sequences)
+	else if (unicode_char < 0x04000000)
+	{
+		// 0xf8 provides the lead byte's counting bits: 111110
+		// followed by the 2 most significant bits of the value
+		*(next_char++) = 0xf8 | (unicode_char >> 24 );
+		// The remaining 24 bits go in four continuation bytes, 6 bits each
+		*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+	// 31 bits: six bytes (also outside RFC 3629 UTF-8)
+	else if (unicode_char < 0x80000000)
+	{
+		// 0xfc provides the lead byte's counting bits: 1111110
+		// followed by the single most significant bit of the value
+		*(next_char++) = 0xfc | (unicode_char >> 30);
+		// The remaining 30 bits go in five continuation bytes, 6 bits each
+		*(next_char++) = 0x80 | ((unicode_char >> 24) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+	// 36 bits: seven bytes; catch-all for values no range above matched
+	else
+	{
+		// 0xfe provides the counting bits: 11111110 (no value bits in this byte)
+		*(next_char++) = 0xfe;
+		// All value bits go in six continuation bytes, 6 bits each
+		*(next_char++) = 0x80 | ((unicode_char >> 30) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 24) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 18) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 12) & 0x3f);
+		*(next_char++) = 0x80 | ((unicode_char >> 6) & 0x3f);
+		*(next_char++) = 0x80 | (unicode_char & 0x3f);
+	}
+
+	return next_char;
+}
+
 char* utf8_encode(const utf_32_char* unicode_string)
 {
 	const utf_32_char* curChar;
@@ -225,79 +317,7 @@ char* utf8_encode(const utf_32_char* unicode_string)

 	for (curChar = unicode_string; *curChar != 0; ++curChar)
 	{
-		// 7 bits
-		if      (*curChar < 0x00000080)
-		{
-			*(curOutPos++) = *curChar;
-		}
-		// 11 bits
-		else if (*curChar < 0x00000800)
-		{
-			// 0xc0 provides the counting bits: 110
-			// then append the 5 most significant bits
-			*(curOutPos++) = 0xc0 | (*curChar >> 6);
-			// Put the next 6 bits in a byte of their own
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
-		// 16 bits
-		else if (*curChar < 0x00010000)
-		{
-			// 0xe0 provides the counting bits: 1110
-			// then append the 4 most significant bits
-			*(curOutPos++) = 0xe0 | (*curChar >> 12);
-			// Put the next 12 bits in two bytes of their own
-			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
-		// 21 bits
-		else if (*curChar < 0x00200000)
-		{
-			// 0xf0 provides the counting bits: 11110
-			// then append the 3 most significant bits
-			*(curOutPos++) = 0xf0 | (*curChar >> 18);
-			// Put the next 18 bits in three bytes of their own
-			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
-		// 26 bits
-		else if (*curChar < 0x04000000)
-		{
-			// 0xf8 provides the counting bits: 111110
-			// then append the 2 most significant bits
-			*(curOutPos++) = 0xf8 | (*curChar >> 24 );
-			// Put the next 24 bits in four bytes of their own
-			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
-		// 31 bits
-		else if (*curChar < 0x80000000)
-		{
-			// 0xfc provides the counting bits: 1111110
-			// then append the 1 most significant bit
-			*(curOutPos++) = 0xfc | (*curChar >> 30);
-			// Put the next 30 bits in five bytes of their own
-			*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
-		// 36 bits
-		else
-		{
-			// 0xfe provides the counting bits: 11111110
-			*(curOutPos++) = 0xfe;
-			// Put the next 36 bits in six bytes of their own
-			*(curOutPos++) = 0x80 | ((*curChar >> 30) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
-			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
-			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
-		}
+		curOutPos = encode_utf8_char(*curChar, curOutPos);
 	}

 	// Terminate the string with a nul character