warzone2100/lib/framework/utf8.c

/*
	This file is part of Warzone 2100.
	Copyright (C) 2007  Giel van Schijndel
	Copyright (C) 2007  Warzone Resurrection Project

	Warzone 2100 is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 2 of the License, or
	(at your option) any later version.

	Warzone 2100 is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with Warzone 2100; if not, write to the Free Software
	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

	$Revision$
	$Id$
	$HeadURL$
*/

#include "utf8.h"
#include "debug.h"
#include <assert.h>

// Assert that non-starting octets are of the form 10xxxxxx
#define ASSERT_NON_START_OCTET(octet) \
	assert((octet & 0xC0) == 0x80 && "invalid non-start UTF-8 octet")

// Assert that starting octets are either of the form 0xxxxxxx (ASCII) or 11xxxxxx
#define ASSERT_START_OCTECT(octet) \
	assert((octet & 0x80) == 0x00 || (octet & 0xC0) == 0xC0 || !"invalid starting UTF-8 octet")

size_t utf8_character_count(const char* utf8_string)
{
	const char* curChar = utf8_string;

	size_t length = 0;
	while (*curChar != '\0')
	{
		ASSERT_START_OCTECT(*curChar);

		// first octect: 0xxxxxxx: 7 bit (ASCII)
		if      ((*curChar & 0x80) == 0x00)
		{
			// 1 byte long encoding
			curChar += 1;
		}
		// first octect: 110xxxxx
		else if ((*curChar & 0xe0) == 0xc0)
		{
			// 2 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			curChar += 2;
		}
		// first octect: 1110xxxx
		else if ((*curChar & 0xf0) == 0xe0)
		{
			// 3 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			curChar += 3;
		}
		// first octect: 11110xxx
		else if ((*curChar & 0xf8) == 0xf0)
		{
			// 4 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			curChar += 4;
		}
		// first octect: 111110xx
		else if ((*curChar & 0xfc) == 0xf8)
		{
			// 5 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);
			curChar += 5;
		}
		// first octect: 1111110x
		else if ((*curChar & 0xfe) == 0xfc)
		{
			// 6 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);
			ASSERT_NON_START_OCTET(curChar[5]);
			curChar += 6;
		}
		// first octect: 11111110
		else if ((*curChar & 0xff) == 0xfe)
		{
			// 7 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);
			ASSERT_NON_START_OCTET(curChar[5]);
			ASSERT_NON_START_OCTET(curChar[6]);
			curChar += 7;
		}
		// first octet: 11111111
		else
		{
			// apparently this character uses more than 36 bit
			// this decoder is not developed to cope with those
			// characters so error out
			ASSERT(!"out-of-range UTF-8 character", "utf8_character_count: this UTF-8 character is too large (> 36bits) for this UTF-8 decoder");
		}

		++length;
	}

	return length;
}

size_t unicode_utf8_buffer_length(const utf_32_char* unicode_string)
{
	const utf_32_char* curChar;

	// Determine length of string (in octets) when encoded in UTF-8
	size_t length = 0;
	for (curChar = unicode_string; *curChar != 0; ++curChar)
	{
		// an ASCII character, which uses 7 bit at most, which is one byte in UTF-8
		if      (*curChar < 0x00000080)
			length += 1; // stores 7 bits
		else if (*curChar < 0x00000800)
			length += 2; // stores 11 bits
		else if (*curChar < 0x00010000)
			length += 3; // stores 16 bits
		else if (*curChar < 0x00200000)
			length += 4; // stores 21 bits
		else if (*curChar < 0x04000000)
			length += 5; // stores 26 bits
		else if (*curChar < 0x80000000)
			length += 6; // stores 31 bits
		else // if (*curChar < 0x1000000000)
			length += 7; // stores 36 bits
	}

	return length;
}

char* utf8_encode(const utf_32_char* unicode_string)
{
	const utf_32_char* curChar;

	const size_t utf8_length = unicode_utf8_buffer_length(unicode_string);

	// Allocate memory to hold the UTF-8 encoded string (plus a terminating nul char)
	char* utf8_string = malloc(utf8_length + 1);
	char* curOutPos = utf8_string;

	if (utf8_string == NULL)
	{
		debug(LOG_ERROR, "utf8_encode: Out of memory");
		return NULL;
	}

	for (curChar = unicode_string; *curChar != 0; ++curChar)
	{
		// 7 bits
		if      (*curChar < 0x00000080)
		{
			*(curOutPos++) = *curChar;
		}
		// 11 bits
		else if (*curChar < 0x00000800)
		{
			// 0xc0 provides the counting bits: 110
			// then append the 5 most significant bits
			*(curOutPos++) = 0xc0 | (*curChar >> 6);
			// Put the next 6 bits in a byte of their own
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
		// 16 bits
		else if (*curChar < 0x00010000)
		{
			// 0xe0 provides the counting bits: 1110
			// then append the 4 most significant bits
			*(curOutPos++) = 0xe0 | (*curChar >> 12);
			// Put the next 12 bits in two bytes of their own
			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
		// 21 bits
		else if (*curChar < 0x00200000)
		{
			// 0xf0 provides the counting bits: 11110
			// then append the 3 most significant bits
			*(curOutPos++) = 0xf0 | (*curChar >> 18);
			// Put the next 18 bits in three bytes of their own
			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
		// 26 bits
		else if (*curChar < 0x04000000)
		{
			// 0xf8 provides the counting bits: 111110
			// then append the 2 most significant bits
			*(curOutPos++) = 0xf8 | (*curChar >> 24 );
			// Put the next 24 bits in four bytes of their own
			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
		// 31 bits
		else if (*curChar < 0x80000000)
		{
			// 0xfc provides the counting bits: 1111110
			// then append the 1 most significant bit
			*(curOutPos++) = 0xfc | (*curChar >> 30);
			// Put the next 30 bits in five bytes of their own
			*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
		// 36 bits
		else
		{
			// 0xfe provides the counting bits: 11111110
			*(curOutPos++) = 0xfe;
			// Put the next 36 bits in six bytes of their own
			*(curOutPos++) = 0x80 | ((*curChar >> 30) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 24) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 18) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 12) & 0x3f);
			*(curOutPos++) = 0x80 | ((*curChar >> 6) & 0x3f);
			*(curOutPos++) = 0x80 | (*curChar & 0x3f);
		}
	}

	// Terminate the string with a nul character
	utf8_string[utf8_length] = '\0';

	return utf8_string;
}

utf_32_char* utf8_decode(const char* utf8_string)
{
	const char* curChar = utf8_string;
	const size_t unicode_length = utf8_character_count(utf8_string);

	// Allocate memory to hold the UTF-32 encoded string (plus a terminating nul)
	utf_32_char* unicode_string = malloc(sizeof(utf_32_char) * (unicode_length + 1));
	utf_32_char* curOutPos = unicode_string;

	if (unicode_string == NULL)
	{
		debug(LOG_ERROR, "utf8_decode: Out of memory");
		return NULL;
	}

	while (*curChar != '\0')
	{
		ASSERT_START_OCTECT(*curChar);

		// first octect: 0xxxxxxx: 7 bit (ASCII)
		if      ((*curChar & 0x80) == 0x00)
		{
			// 1 byte long encoding
			*curOutPos = *(curChar++);
		}
		// first octect: 110xxxxx: 11 bit
		else if ((*curChar & 0xe0) == 0xc0)
		{
			// 2 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);

			*curOutPos  = (*(curChar++) & 0x1f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octect: 1110xxxx: 16 bit
		else if ((*curChar & 0xf0) == 0xe0)
		{
			// 3 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);

			*curOutPos  = (*(curChar++) & 0x0f) << 12;
			*curOutPos |= (*(curChar++) & 0x3f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octect: 11110xxx: 21 bit
		else if ((*curChar & 0xf8) == 0xf0)
		{
			// 4 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);

			*curOutPos  = (*(curChar++) & 0x07) << 18;
			*curOutPos |= (*(curChar++) & 0x3f) << 12;
			*curOutPos |= (*(curChar++) & 0x3f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octect: 111110xx: 26 bit
		else if ((*curChar & 0xfc) == 0xf8)
		{
			// 5 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);

			*curOutPos  = (*(curChar++) & 0x03) << 24;
			*curOutPos |= (*(curChar++) & 0x3f) << 18;
			*curOutPos |= (*(curChar++) & 0x3f) << 12;
			*curOutPos |= (*(curChar++) & 0x3f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octect: 1111110x: 31 bit
		else if ((*curChar & 0xfe) == 0xfc)
		{
			// 6 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);
			ASSERT_NON_START_OCTET(curChar[5]);

			*curOutPos  = (*(curChar++) & 0x01) << 30;
			*curOutPos |= (*(curChar++) & 0x3f) << 24;
			*curOutPos |= (*(curChar++) & 0x3f) << 18;
			*curOutPos |= (*(curChar++) & 0x3f) << 12;
			*curOutPos |= (*(curChar++) & 0x3f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octect: 11111110: 36 bit (we'll only use 32bit though)
		else if ((*curChar & 0xff) == 0xfe)
		{
			// 7 byte long encoding
			ASSERT_NON_START_OCTET(curChar[1]);
			ASSERT_NON_START_OCTET(curChar[2]);
			ASSERT_NON_START_OCTET(curChar[3]);
			ASSERT_NON_START_OCTET(curChar[4]);
			ASSERT_NON_START_OCTET(curChar[5]);
			ASSERT_NON_START_OCTET(curChar[6]);

			// original: *curOutPos  = (*(curChar++) & 0x00) << 36;
			// The first octect contains no data bits
			*curOutPos = 0; ++curChar;

			// original: *curOutPos |= (*(curChar++) & 0x3f) << 30;
			// Use only the 2 least significant bits of this byte
			// to make sure we use 32bit at maximum
			*curOutPos |= (*(curChar++) & 0x03) << 30;

			*curOutPos |= (*(curChar++) & 0x3f) << 24;
			*curOutPos |= (*(curChar++) & 0x3f) << 18;
			*curOutPos |= (*(curChar++) & 0x3f) << 12;
			*curOutPos |= (*(curChar++) & 0x3f) << 6;
			*curOutPos |= (*(curChar++) & 0x3f) << 0;
		}
		// first octet: 11111111: 41 bit or more
		else
		{
			// apparently this character uses more than 36 bit
			// this decoder is not developed to cope with those
			// characters so error out
			ASSERT(!"out-of-range UTF-8 character", "utf8_character_count: this UTF-8 character is too large (> 36bits) for this UTF-8 decoder");
		}

		++curOutPos;
	}

	// Terminate the string with a nul
	unicode_string[unicode_length] = '\0';

	return unicode_string;
}

size_t unicode_strlen(const utf_32_char* unicode_string)
{
	size_t length = 0;
	while (*(unicode_string++))
	{
		++length;
	}

	return length;
}