geany/src/encodings.c

/*
 *      encodings.c - this file is part of Geany, a fast and lightweight IDE
 *
 *      Copyright 2005-2007 Enrico Tröger <enrico.troeger@uvena.de>
 *      Copyright 2006-2007 Nick Treleaven <nick.treleaven@btinternet.com>
 *
 *      This program is free software; you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation; either version 2 of the License, or
 *      (at your option) any later version.
 *
 *      This program is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with this program; if not, write to the Free Software
 *      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 *  $Id$
 */

/*
 * Encoding conversion and Byte Order Mark (BOM) handling.
 */

/*
 * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
 * list of people on the gedit Team.
 * See the gedit ChangeLog files for a list of changes.
 */
 /* Stolen from anjuta */

#include <string.h>

#include "geany.h"
#include "utils.h"
#include "support.h"
#include "msgwindow.h"
#include "encodings.h"
#include "callbacks.h"


#define fill(Order, Group, Idx, Charset, Name) \
		encodings[Idx].idx = Idx; \
		encodings[Idx].order = Order; \
		encodings[Idx].group = Group; \
		encodings[Idx].charset = Charset; \
		encodings[Idx].name = Name;

static void init_encodings(void)
{
	fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
	fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
	fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
	fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
	fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
	fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
	fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
	fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
	fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));

	fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
	fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
	fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
	fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
	fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
	fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
	fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
	fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
	fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
	fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8R", _("Cyrillic"));
	fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
	fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
	fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8U", _("Cyrillic/Ukrainian"));
	fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));

	fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
	fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
	fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
	fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
	fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
	fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
	fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));

	fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
	fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
	fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
	fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
	fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
	fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
	fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
	fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
	fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));

	fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
	fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
	fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
	fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
	fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
	fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
	fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
	fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));

	fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
	fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
	fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
	fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
	fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
	fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
	fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
	fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
	fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
	fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
	fill(10, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
	fill(11, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
	fill(12, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
	fill(13, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));

	fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
}


const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
{
	gint i;

	if (charset == NULL) return &encodings[GEANY_ENCODING_UTF_8];

	i = 0;
	while (i < GEANY_ENCODINGS_MAX)
	{
		if (strcmp(charset, encodings[i].charset) == 0)
			return &encodings[i];

		++i;
	}

	return NULL;
}


const GeanyEncoding *encodings_get_from_index(gint idx)
{
	g_return_val_if_fail(idx >= 0, NULL);
	g_return_val_if_fail(idx < GEANY_ENCODINGS_MAX, NULL);

	return &encodings[idx];
}


gchar *encodings_to_string(const GeanyEncoding* enc)
{
	g_return_val_if_fail(enc != NULL, NULL);
	g_return_val_if_fail(enc->name != NULL, NULL);
	g_return_val_if_fail(enc->charset != NULL, NULL);

    return g_strdup_printf("%s (%s)", enc->name, enc->charset);
}


const gchar *encodings_get_charset(const GeanyEncoding* enc)
{
	g_return_val_if_fail(enc != NULL, NULL);
	g_return_val_if_fail(enc->charset != NULL, NULL);

	return enc->charset;
}

static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];


void encodings_select_radio_item(const gchar *charset)
{
	gint i;
	g_return_if_fail(charset != NULL);

	i = 0;
	while (i < GEANY_ENCODINGS_MAX)
	{
		if (utils_str_equal(charset, encodings[i].charset)) break;
		i++;
	}
	if (i == GEANY_ENCODINGS_MAX) i = GEANY_ENCODING_UTF_8; // fallback to UTF-8

	// app->ignore_callback has to be set by the caller
	gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
}


void encodings_init(void)
{
	GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
			  *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
			  *item_asian, *item_utf8, *item_middleeast;
	GCallback cb_func[2];
	GSList *group = NULL;
	gchar *label;
	gint order, group_size;
	guint i, j, k;

	init_encodings();

	// create encodings submenu in document menu
	menu[0] = lookup_widget(app->window, "set_encoding1_menu");
	menu[1] = lookup_widget(app->window, "menu_reload_as1_menu");
	cb_func[0] = G_CALLBACK(on_encoding_change);
	cb_func[1] = G_CALLBACK(on_reload_as_activate);

	for (k = 0; k < 2; k++)
	{
		menu_westeuro = gtk_menu_new();
		item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
		gtk_widget_show_all(item_westeuro);

		menu_easteuro = gtk_menu_new();
		item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
		gtk_widget_show_all(item_easteuro);

		menu_eastasian = gtk_menu_new();
		item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
		gtk_widget_show_all(item_eastasian);

		menu_asian = gtk_menu_new();
		item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
		gtk_widget_show_all(item_asian);

		menu_middleeast = gtk_menu_new();
		item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
		gtk_widget_show_all(item_middleeast);

		menu_utf8 = gtk_menu_new();
		item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
		gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
		gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
		gtk_widget_show_all(item_utf8);

		/// TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)"
		for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
		{
			order = 0;
			switch (i)
			{
				case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
				case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
				case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
				case ASIAN: submenu = menu_asian; group_size = 9; break;
				case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
				case UNICODE: submenu = menu_utf8; group_size = 8; break;
				default: submenu = menu[k]; group_size = 1;
			}

			while (order < group_size)	// the biggest group has 13 elements
			{
				for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
				{
					if (encodings[j].group == i && encodings[j].order == order)
					{
						label = encodings_to_string(&encodings[j]);
						if (k == 0)
						{
							item = gtk_radio_menu_item_new_with_label(group, label);
							group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
							radio_items[j] = item;
						}
						else
							item = gtk_menu_item_new_with_label(label);
						gtk_widget_show(item);
						gtk_container_add(GTK_CONTAINER(submenu), item);
						g_signal_connect((gpointer) item, "activate",
										cb_func[k], GINT_TO_POINTER(encodings[j].idx));
						g_free(label);
						break;
					}
				}
				order++;
			}
		}
	}
}


/* Converts a string from the given charset to UTF-8.
 * If fast is set, no further checks are performed. */
gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
											const gchar *charset, gboolean fast)
{
	gchar *utf8_content = NULL;
	GError *conv_error = NULL;
	gchar* converted_contents = NULL;
	gsize bytes_written;

	g_return_val_if_fail(buffer != NULL, NULL);
	g_return_val_if_fail(charset != NULL, NULL);

	converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
								   &bytes_written, &conv_error);

	if (fast)
	{
		utf8_content = converted_contents;
		if (conv_error != NULL) g_error_free(conv_error);
	}
	else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
	{
		if (conv_error != NULL)
		{
			geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
			g_error_free(conv_error);
			conv_error = NULL;
		}
		else
			geany_debug("Couldn't convert from %s to UTF-8.", charset);

		utf8_content = NULL;
		if (converted_contents != NULL) g_free(converted_contents);
	}
	else
	{
		geany_debug("Converted from %s to UTF-8.", charset);
		utf8_content = converted_contents;
	}

	return utf8_content;
}


gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
{
	gchar *locale_charset = NULL;
	gchar *utf8_content;
	gchar *charset;
	gboolean check_current = FALSE;
	guint i;

	// current locale is not UTF-8, we have to check this charset
	check_current = ! g_get_charset((const gchar**)&locale_charset);

	for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
	{
		if (i == (guint) encodings[GEANY_ENCODING_NONE].idx) continue;

		if (check_current)
		{
			check_current = FALSE;
			charset = locale_charset;
			i = -1;
		}
		else
			charset = encodings[i].charset;

		geany_debug("Trying to convert %d bytes of data from %s into UTF-8.", size, charset);
		utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);

		if (utf8_content != NULL)
		{
			if (used_encoding != NULL)
			{
				if (*used_encoding != NULL)
				{
					g_free(*used_encoding);
					geany_debug("%s:%d", __FILE__, __LINE__);
				}
				*used_encoding = g_strdup(charset);
			}
			return utf8_content;
		}
	}

	return NULL;
}


/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
 * otherwise GEANY_ENCODING_NONE.
 * */
GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
{
	if (len >= 3)
	{
		if (bom_len)
			*bom_len = 3;

		if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
			(guchar)string[2] == 0xbf)
		{
			return GEANY_ENCODING_UTF_8;
		}
	}
	if (len >= 4)
	{
		if (bom_len)
			*bom_len = 4;

		if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
				 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
		{
			return GEANY_ENCODING_UTF_32BE; // Big endian
		}
		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
				 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
		{
			return GEANY_ENCODING_UTF_32LE; // Little endian
		}
		if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
				 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
		{
			 return GEANY_ENCODING_UTF_7;
		}
	}
	if (len >= 2)
	{
		if (bom_len)
			*bom_len = 2;

		if ((guchar)string[0]==0xfe && (guchar)string[1] == 0xff)
		{
			return GEANY_ENCODING_UTF_16BE; // Big endian
		}
		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
		{
			return GEANY_ENCODING_UTF_16LE; // Little endian
		}
	}
	if (bom_len)
		*bom_len = 0;
	return GEANY_ENCODING_NONE;
}


gboolean encodings_is_unicode_charset(const gchar *string)
{
	if (string != NULL && (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
	{
		return TRUE;
	}
	return FALSE;
}