2005-11-22 12:26:26 +00:00
|
|
|
/*
|
|
|
|
* encodings.c - this file is part of Geany, a fast and lightweight IDE
|
|
|
|
*
|
2012-06-18 01:13:05 +02:00
|
|
|
* Copyright 2005-2012 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
|
|
|
|
* Copyright 2006-2012 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
|
2005-11-22 12:26:26 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
2012-08-24 19:25:57 +02:00
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2005-11-22 12:26:26 +00:00
|
|
|
*/
|
|
|
|
|
2007-02-24 11:41:56 +00:00
|
|
|
/*
|
|
|
|
* Encoding conversion and Byte Order Mark (BOM) handling.
|
|
|
|
*/
|
2005-11-22 12:26:26 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
|
|
|
|
* list of people on the gedit Team.
|
|
|
|
* See the gedit ChangeLog files for a list of changes.
|
|
|
|
*/
|
|
|
|
/* Stolen from anjuta */
|
|
|
|
|
2014-05-18 17:31:51 -07:00
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
# include "config.h"
|
|
|
|
#endif
|
2005-11-22 12:26:26 +00:00
|
|
|
|
|
|
|
#include "encodings.h"
|
2014-05-18 17:31:51 -07:00
|
|
|
|
|
|
|
#include "app.h"
|
2006-06-19 18:31:17 +00:00
|
|
|
#include "callbacks.h"
|
2014-05-18 17:31:51 -07:00
|
|
|
#include "documentprivate.h"
|
2014-10-13 16:36:36 -07:00
|
|
|
#include "pluginexport.h"
|
2014-05-18 17:31:51 -07:00
|
|
|
#include "support.h"
|
2008-05-22 14:41:28 +00:00
|
|
|
#include "ui_utils.h"
|
2014-05-18 17:31:51 -07:00
|
|
|
#include "utils.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
|
2008-02-27 13:17:29 +00:00
|
|
|
/* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
|
2011-12-30 16:12:17 +00:00
|
|
|
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
|
2008-06-08 12:18:20 +00:00
|
|
|
/* Matches in-file declarations such as " geany_encoding=utf-8 " or " coding: utf-8 ".
 * The captured charset name may contain letters, digits, '-' and '_' (the pattern is compiled
 * case-insensitively), so that e.g. " coding: shift_jis " is captured in full instead of being
 * cut off at the underscore. */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9_-]+)\"?[\t ]*"
|
2010-08-25 11:44:33 +00:00
|
|
|
|
2008-02-27 13:17:29 +00:00
|
|
|
/* precompiled regexps */
|
2011-12-30 15:53:47 +00:00
|
|
|
static GRegex *pregs[2];
|
2008-04-27 14:33:40 +00:00
|
|
|
static gboolean pregs_loaded = FALSE;
|
2007-12-07 14:13:05 +00:00
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
|
2008-02-20 11:24:23 +00:00
|
|
|
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
|
|
|
|
|
|
|
|
|
2007-01-07 16:22:41 +00:00
|
|
|
/* Fills the encodings[] slot selected by Idx: stores the slot's own index, its position
 * (Order) within its group (Group), the charset string and the display name.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon, so that
 * "fill(...);" is exactly one statement and stays correct inside an unbraced if/else. */
#define fill(Order, Group, Idx, Charset, Name) \
	do { \
		encodings[(Idx)].idx = (Idx); \
		encodings[(Idx)].order = (Order); \
		encodings[(Idx)].group = (Group); \
		encodings[(Idx)].charset = (Charset); \
		encodings[(Idx)].name = (Name); \
	} while (0)
|
2006-07-16 18:46:53 +00:00
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
static void init_encodings(void)
|
|
|
|
{
|
2011-03-31 23:27:45 +00:00
|
|
|
fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
|
|
|
|
fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
|
|
|
|
fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
|
|
|
|
fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
|
|
|
|
fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
|
|
|
|
fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
|
|
|
|
fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
|
|
|
|
fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
|
|
|
|
fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
|
|
|
|
|
|
|
|
fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
|
|
|
|
fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
|
|
|
|
fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
|
|
|
|
fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
|
|
|
|
fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
|
|
|
|
fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
|
|
|
|
fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
|
|
|
|
fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
|
2008-06-08 12:18:20 +00:00
|
|
|
/* ISO-IR-111 not available on Windows */
|
2011-03-31 23:27:45 +00:00
|
|
|
fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
|
|
|
|
fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
|
|
|
|
fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
|
|
|
|
fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
|
|
|
|
fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
|
|
|
|
fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
|
|
|
|
|
|
|
|
fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
|
|
|
|
fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
|
|
|
|
fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
|
|
|
|
fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
|
2008-06-08 12:18:20 +00:00
|
|
|
/* not available at all, ? */
|
2011-03-31 23:27:45 +00:00
|
|
|
fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
|
|
|
|
fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
|
|
|
|
fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
|
|
|
|
|
|
|
|
fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
|
|
|
|
fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
|
|
|
|
fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
|
|
|
|
fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
|
|
|
|
fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
|
|
|
|
fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
|
|
|
|
fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
|
|
|
|
fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
|
|
|
|
fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
|
|
|
|
|
|
|
|
fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
|
|
|
|
fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
|
|
|
|
fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
|
|
|
|
fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
|
|
|
|
fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
|
|
|
|
fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
|
|
|
|
fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
|
|
|
|
fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
|
|
|
|
|
|
|
|
fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
|
|
|
|
fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
|
|
|
|
fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
|
2008-06-08 12:18:20 +00:00
|
|
|
/* maybe not available on Linux */
|
2011-03-31 23:27:45 +00:00
|
|
|
fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
|
|
|
|
fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
|
|
|
|
fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
|
|
|
|
fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
|
|
|
|
fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
|
|
|
|
fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
|
|
|
|
fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
|
|
|
|
fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
|
|
|
|
fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
|
|
|
|
fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
|
|
|
|
fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
|
|
|
|
fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
|
|
|
|
|
|
|
|
fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
|
2005-11-22 12:26:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-03-31 23:27:58 +00:00
|
|
|
/* compares two encoding names in a permissive fashion.
|
|
|
|
* e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
|
|
|
|
static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
|
|
|
|
{
|
|
|
|
gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
|
|
|
|
gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
|
|
|
|
|
|
|
|
while (*a && *b)
|
|
|
|
{
|
|
|
|
gboolean is_alpha;
|
|
|
|
|
|
|
|
if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
|
|
|
|
((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
|
|
|
|
{
|
|
|
|
/* either there was a real separator, or we need a implicit one (a chage from alpha to
|
|
|
|
* numeric or so) */
|
|
|
|
if (! need_sep || (was_alpha != is_alpha))
|
|
|
|
{
|
|
|
|
a++;
|
|
|
|
b++;
|
|
|
|
was_alpha = is_alpha;
|
|
|
|
need_sep = FALSE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
guint n_sep = 0;
|
|
|
|
|
|
|
|
if (! g_ascii_isalnum(*a))
|
|
|
|
{
|
|
|
|
a++;
|
|
|
|
n_sep++;
|
|
|
|
}
|
|
|
|
if (! g_ascii_isalnum(*b))
|
|
|
|
{
|
|
|
|
b++;
|
|
|
|
n_sep++;
|
|
|
|
}
|
|
|
|
if (n_sep < 1)
|
|
|
|
return FALSE;
|
|
|
|
else if (n_sep < 2)
|
|
|
|
need_sep = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return *a == *b;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-12-07 14:13:05 +00:00
|
|
|
GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
|
|
|
|
{
|
|
|
|
gint i;
|
|
|
|
|
2009-04-15 22:47:33 +00:00
|
|
|
if (charset == NULL)
|
2009-04-05 21:07:40 +00:00
|
|
|
return GEANY_ENCODING_UTF_8;
|
2007-12-07 14:13:05 +00:00
|
|
|
|
|
|
|
i = 0;
|
|
|
|
while (i < GEANY_ENCODINGS_MAX)
|
|
|
|
{
|
2011-03-31 23:27:58 +00:00
|
|
|
if (encodings_charset_equals(charset, encodings[i].charset))
|
2007-12-07 14:13:05 +00:00
|
|
|
return i;
|
|
|
|
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
return GEANY_ENCODING_UTF_8;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
|
|
|
gint i;
|
|
|
|
|
2009-04-15 22:47:33 +00:00
|
|
|
if (charset == NULL)
|
2009-04-05 21:07:40 +00:00
|
|
|
return &encodings[GEANY_ENCODING_UTF_8];
|
2006-07-25 09:12:51 +00:00
|
|
|
|
2005-11-22 12:26:26 +00:00
|
|
|
i = 0;
|
2006-06-21 18:54:07 +00:00
|
|
|
while (i < GEANY_ENCODINGS_MAX)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
2011-03-31 23:27:58 +00:00
|
|
|
if (encodings_charset_equals(charset, encodings[i].charset))
|
2005-11-22 12:26:26 +00:00
|
|
|
return &encodings[i];
|
|
|
|
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-03-31 23:27:58 +00:00
|
|
|
static const gchar *encodings_normalize_charset(const gchar *charset)
|
|
|
|
{
|
|
|
|
const GeanyEncoding *encoding;
|
|
|
|
|
|
|
|
encoding = encodings_get_from_charset(charset);
|
|
|
|
if (encoding != NULL)
|
|
|
|
return encoding->charset;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-03-09 13:52:26 +00:00
|
|
|
/* Returns a pointer to the encodings table entry at @a idx,
 * or NULL (via g_return_val_if_fail()) when @a idx is outside 0 .. GEANY_ENCODINGS_MAX-1. */
const GeanyEncoding *encodings_get_from_index(gint idx)
{
	g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);

	return encodings + idx;
}
|
|
|
|
|
|
|
|
|
2008-02-17 18:00:42 +00:00
|
|
|
/**
|
|
|
|
* Gets the character set name of the specified index e.g. for use with
|
|
|
|
* @ref document_set_encoding().
|
2008-01-16 16:50:10 +00:00
|
|
|
*
|
2008-02-17 18:00:42 +00:00
|
|
|
* @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
|
2008-01-16 16:50:10 +00:00
|
|
|
*
|
|
|
|
*
|
2008-02-17 18:00:42 +00:00
|
|
|
* @return The charset according to idx, or @c NULL if the index is invalid.
|
|
|
|
*
|
|
|
|
* @since 0.13
|
|
|
|
**/
|
2014-10-13 16:36:36 -07:00
|
|
|
GEANY_API_SYMBOL
|
2008-01-16 16:50:10 +00:00
|
|
|
const gchar* encodings_get_charset_from_index(gint idx)
|
|
|
|
{
|
2009-04-15 22:47:33 +00:00
|
|
|
g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
|
2008-01-16 16:50:10 +00:00
|
|
|
|
|
|
|
return encodings[idx].charset;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
gchar *encodings_to_string(const GeanyEncoding* enc)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
2009-04-15 22:47:33 +00:00
|
|
|
g_return_val_if_fail(enc != NULL, NULL);
|
|
|
|
g_return_val_if_fail(enc->name != NULL, NULL);
|
|
|
|
g_return_val_if_fail(enc->charset != NULL, NULL);
|
2005-11-22 12:26:26 +00:00
|
|
|
|
2011-08-06 18:34:14 +00:00
|
|
|
return g_strdup_printf("%s (%s)", enc->name, enc->charset);
|
2005-11-22 12:26:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
const gchar *encodings_get_charset(const GeanyEncoding* enc)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
2009-04-15 22:47:33 +00:00
|
|
|
g_return_val_if_fail(enc != NULL, NULL);
|
|
|
|
g_return_val_if_fail(enc->charset != NULL, NULL);
|
2005-11-22 12:26:26 +00:00
|
|
|
|
|
|
|
return enc->charset;
|
|
|
|
}
|
|
|
|
|
2009-09-16 14:13:38 +00:00
|
|
|
|
2006-08-03 22:17:10 +00:00
|
|
|
static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
|
|
|
|
|
|
|
|
|
|
|
|
void encodings_select_radio_item(const gchar *charset)
|
|
|
|
{
|
|
|
|
gint i;
|
2008-06-16 18:31:59 +00:00
|
|
|
|
2009-04-15 22:47:33 +00:00
|
|
|
g_return_if_fail(charset != NULL);
|
2006-08-03 22:17:10 +00:00
|
|
|
|
|
|
|
i = 0;
|
|
|
|
while (i < GEANY_ENCODINGS_MAX)
|
|
|
|
{
|
2008-06-16 18:31:59 +00:00
|
|
|
if (utils_str_equal(charset, encodings[i].charset))
|
|
|
|
break;
|
2006-08-03 22:17:10 +00:00
|
|
|
i++;
|
|
|
|
}
|
2009-04-15 22:47:33 +00:00
|
|
|
if (i == GEANY_ENCODINGS_MAX)
|
2008-06-16 18:31:59 +00:00
|
|
|
i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
|
2006-08-03 22:17:10 +00:00
|
|
|
|
2008-05-22 14:41:28 +00:00
|
|
|
/* ignore_callback has to be set by the caller */
|
2006-08-03 22:17:10 +00:00
|
|
|
gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
|
|
|
|
}
|
|
|
|
|
2005-11-22 12:26:26 +00:00
|
|
|
|
2008-09-23 15:19:46 +00:00
|
|
|
/* Regexp detection of file encoding declared in the file itself.
|
2007-12-07 14:13:05 +00:00
|
|
|
* Idea and parts of code taken from Bluefish, thanks.
|
|
|
|
* regex_compile() is used to compile regular expressions on program init and keep it in memory
|
|
|
|
* for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
|
|
|
|
*/
|
2011-12-30 15:53:47 +00:00
|
|
|
static GRegex *regex_compile(const gchar *pattern)
|
2007-12-07 14:13:05 +00:00
|
|
|
{
|
2011-12-30 15:53:47 +00:00
|
|
|
GError *error = NULL;
|
|
|
|
GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
|
|
|
|
|
|
|
|
if (!regex)
|
2007-12-07 14:13:05 +00:00
|
|
|
{
|
2011-12-30 15:53:47 +00:00
|
|
|
geany_debug("Failed to compile encoding regex (%s)", error->message);
|
|
|
|
g_error_free(error);
|
2007-12-07 14:13:05 +00:00
|
|
|
}
|
2011-12-30 15:53:47 +00:00
|
|
|
return regex;
|
2007-12-07 14:13:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-12-30 15:53:47 +00:00
|
|
|
/* Searches the beginning of @a buffer for an encoding declaration using @a preg.
 *
 * @param preg   compiled regex whose first capture group is the charset name; may be NULL
 *               (regex_compile() returns NULL on failure), in which case nothing is matched.
 * @param buffer the data to scan; may be NULL.
 * @param size   number of bytes available in @a buffer.
 *
 * @return the captured charset name converted to upper case as a newly allocated string
 *         (free with g_free()), or NULL if nothing was found. */
static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
{
	gchar *encoding = NULL;
	/* must start out as NULL: it is freed unconditionally below, even if
	 * g_regex_match_full() returns early without ever writing to it */
	GMatchInfo *minfo = NULL;

	if (G_UNLIKELY(! pregs_loaded || preg == NULL || buffer == NULL))
		return NULL;

	/* scan only the first 512 bytes in the buffer */
	size = MIN(size, 512);

	if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
		g_match_info_get_match_count(minfo) >= 2)
	{
		encoding = g_match_info_fetch(minfo, 1);
		geany_debug("Detected encoding by regex search: %s", encoding);

		SETPTR(encoding, g_utf8_strup(encoding, -1));
	}
	g_match_info_free(minfo);
	return encoding;
}
|
|
|
|
|
|
|
|
|
2009-04-17 15:30:01 +00:00
|
|
|
/* Signal handler for the "set encoding" radio items: switches the current document to the
 * charset passed as @a user_data and records the previous encoding for undo.
 * Nothing happens when callbacks are being ignored, there is no current document, no charset
 * was supplied, the item was deactivated rather than activated, or the document already uses
 * that charset. Read-only documents only trigger a beep. */
static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
{
	GeanyDocument *doc = document_get_current();
	const gchar *charset = user_data;

	if (ignore_callback || doc == NULL || charset == NULL)
		return;
	if (! gtk_check_menu_item_get_active(menuitem))
		return;
	if (utils_str_equal(charset, doc->encoding))
		return;

	if (doc->readonly)
	{
		utils_beep();
		return;
	}

	/* the undo action receives its own copy of the old encoding name */
	document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
	document_set_encoding(doc, charset);
}
|
|
|
|
|
|
|
|
/* Signal handler for the "reload as" menu items: forwards the current document together
 * with @a user_data (the charset string attached when the signal was connected) to
 * document_reload_prompt(). Returns early with a warning when there is no current document. */
static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
{
	GeanyDocument *current = document_get_current();

	g_return_if_fail(current != NULL);

	document_reload_prompt(current, user_data);
}
|
|
|
|
|
|
|
|
|
2007-12-07 14:13:05 +00:00
|
|
|
void encodings_finalize(void)
|
|
|
|
{
|
2009-04-15 22:47:33 +00:00
|
|
|
if (pregs_loaded)
|
2007-12-07 14:13:05 +00:00
|
|
|
{
|
2009-04-05 21:07:40 +00:00
|
|
|
guint i, len;
|
|
|
|
len = G_N_ELEMENTS(pregs);
|
|
|
|
for (i = 0; i < len; i++)
|
2007-12-07 14:13:05 +00:00
|
|
|
{
|
2011-12-30 15:53:47 +00:00
|
|
|
g_regex_unref(pregs[i]);
|
2007-12-07 14:13:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
void encodings_init(void)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
2006-07-22 01:29:10 +00:00
|
|
|
GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian,
|
2006-07-16 18:46:53 +00:00
|
|
|
*menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian,
|
|
|
|
*item_asian, *item_utf8, *item_middleeast;
|
2006-07-22 01:29:10 +00:00
|
|
|
GCallback cb_func[2];
|
2006-08-03 22:17:10 +00:00
|
|
|
GSList *group = NULL;
|
2006-06-19 18:31:17 +00:00
|
|
|
gchar *label;
|
2006-08-08 13:01:50 +00:00
|
|
|
gint order, group_size;
|
|
|
|
guint i, j, k;
|
2006-07-08 14:44:18 +00:00
|
|
|
|
2006-06-19 18:31:17 +00:00
|
|
|
init_encodings();
|
2005-11-22 12:26:26 +00:00
|
|
|
|
2009-04-15 22:47:33 +00:00
|
|
|
if (! pregs_loaded)
|
2008-04-27 14:33:40 +00:00
|
|
|
{
|
2011-12-30 15:53:47 +00:00
|
|
|
pregs[0] = regex_compile(PATTERN_HTMLMETA);
|
|
|
|
pregs[1] = regex_compile(PATTERN_CODING);
|
2008-04-27 14:33:40 +00:00
|
|
|
pregs_loaded = TRUE;
|
|
|
|
}
|
2007-12-07 14:13:05 +00:00
|
|
|
|
2008-02-27 13:17:29 +00:00
|
|
|
/* create encodings submenu in document menu */
|
2008-12-18 21:21:53 +00:00
|
|
|
menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
|
|
|
|
menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
|
2009-04-17 15:30:01 +00:00
|
|
|
cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
|
2014-06-24 09:13:44 +02:00
|
|
|
cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
|
2006-07-22 01:29:10 +00:00
|
|
|
|
|
|
|
for (k = 0; k < 2; k++)
|
2005-11-22 12:26:26 +00:00
|
|
|
{
|
2006-07-22 01:29:10 +00:00
|
|
|
menu_westeuro = gtk_menu_new();
|
|
|
|
item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro);
|
|
|
|
gtk_widget_show_all(item_westeuro);
|
|
|
|
|
|
|
|
menu_easteuro = gtk_menu_new();
|
|
|
|
item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro);
|
|
|
|
gtk_widget_show_all(item_easteuro);
|
|
|
|
|
|
|
|
menu_eastasian = gtk_menu_new();
|
|
|
|
item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian);
|
|
|
|
gtk_widget_show_all(item_eastasian);
|
|
|
|
|
|
|
|
menu_asian = gtk_menu_new();
|
|
|
|
item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_asian);
|
|
|
|
gtk_widget_show_all(item_asian);
|
|
|
|
|
|
|
|
menu_middleeast = gtk_menu_new();
|
|
|
|
item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast);
|
|
|
|
gtk_widget_show_all(item_middleeast);
|
|
|
|
|
|
|
|
menu_utf8 = gtk_menu_new();
|
|
|
|
item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode"));
|
|
|
|
gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8);
|
|
|
|
gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8);
|
|
|
|
gtk_widget_show_all(item_utf8);
|
|
|
|
|
2008-02-27 13:17:29 +00:00
|
|
|
/** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */
|
2006-07-22 01:29:10 +00:00
|
|
|
for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
|
2006-07-16 18:46:53 +00:00
|
|
|
{
|
2006-07-22 01:29:10 +00:00
|
|
|
order = 0;
|
|
|
|
switch (i)
|
|
|
|
{
|
|
|
|
case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break;
|
|
|
|
case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break;
|
|
|
|
case EASTASIAN: submenu = menu_eastasian; group_size = 14; break;
|
|
|
|
case ASIAN: submenu = menu_asian; group_size = 9; break;
|
|
|
|
case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break;
|
|
|
|
case UNICODE: submenu = menu_utf8; group_size = 8; break;
|
2006-10-23 20:32:37 +00:00
|
|
|
default: submenu = menu[k]; group_size = 1;
|
2006-07-22 01:29:10 +00:00
|
|
|
}
|
2006-07-16 18:46:53 +00:00
|
|
|
|
2008-02-27 13:17:29 +00:00
|
|
|
while (order < group_size) /* the biggest group has 13 elements */
|
2006-07-18 12:27:36 +00:00
|
|
|
{
|
2006-07-22 01:29:10 +00:00
|
|
|
for (j = 0; j < GEANY_ENCODINGS_MAX; j++)
|
2006-07-18 12:27:36 +00:00
|
|
|
{
|
2006-07-22 01:29:10 +00:00
|
|
|
if (encodings[j].group == i && encodings[j].order == order)
|
|
|
|
{
|
|
|
|
label = encodings_to_string(&encodings[j]);
|
2006-08-03 22:17:10 +00:00
|
|
|
if (k == 0)
|
|
|
|
{
|
|
|
|
item = gtk_radio_menu_item_new_with_label(group, label);
|
|
|
|
group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
|
|
|
|
radio_items[j] = item;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
item = gtk_menu_item_new_with_label(label);
|
2006-07-22 01:29:10 +00:00
|
|
|
gtk_widget_show(item);
|
|
|
|
gtk_container_add(GTK_CONTAINER(submenu), item);
|
2014-06-24 09:13:44 +02:00
|
|
|
g_signal_connect(item, "activate", cb_func[k],
|
|
|
|
(gpointer) encodings[j].charset);
|
2006-07-22 01:29:10 +00:00
|
|
|
g_free(label);
|
|
|
|
break;
|
|
|
|
}
|
2006-07-18 12:27:36 +00:00
|
|
|
}
|
2006-07-22 01:29:10 +00:00
|
|
|
order++;
|
2006-07-18 12:27:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-11-22 12:26:26 +00:00
|
|
|
}
|
2006-10-25 14:48:07 +00:00
|
|
|
|
|
|
|
|
2014-01-17 21:56:51 +01:00
|
|
|
static gint encoding_combo_store_sort_func(GtkTreeModel *model,
|
|
|
|
GtkTreeIter *a,
|
|
|
|
GtkTreeIter *b,
|
|
|
|
gpointer data)
|
|
|
|
{
|
|
|
|
gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
|
|
|
|
gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
|
|
|
|
gchar *a_string;
|
|
|
|
gchar *b_string;
|
|
|
|
gint cmp_res;
|
|
|
|
|
|
|
|
if (a_has_child != b_has_child)
|
|
|
|
return a_has_child ? -1 : 1;
|
|
|
|
|
|
|
|
gtk_tree_model_get(model, a, 1, &a_string, -1);
|
|
|
|
gtk_tree_model_get(model, b, 1, &b_string, -1);
|
|
|
|
cmp_res = strcmp(a_string, b_string);
|
|
|
|
g_free(a_string);
|
|
|
|
g_free(b_string);
|
|
|
|
return cmp_res;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
|
|
|
|
{
|
|
|
|
GtkTreeStore *store;
|
|
|
|
GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
|
|
|
|
iter_asian, iter_utf8, iter_middleeast;
|
|
|
|
GtkTreeIter *iter_parent;
|
|
|
|
gchar *encoding_string;
|
|
|
|
gint i;
|
|
|
|
|
|
|
|
store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
|
|
|
|
|
|
|
|
if (has_detect)
|
|
|
|
{
|
|
|
|
gtk_tree_store_append(store, &iter_current, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
gtk_tree_store_append(store, &iter_westeuro, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
|
|
|
|
gtk_tree_store_append(store, &iter_easteuro, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
|
|
|
|
gtk_tree_store_append(store, &iter_eastasian, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
|
|
|
|
gtk_tree_store_append(store, &iter_asian, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
|
|
|
|
gtk_tree_store_append(store, &iter_middleeast, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
|
|
|
|
gtk_tree_store_append(store, &iter_utf8, NULL);
|
|
|
|
gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
|
|
|
|
|
|
|
|
for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
|
|
|
|
{
|
|
|
|
switch (encodings[i].group)
|
|
|
|
{
|
|
|
|
case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
|
|
|
|
case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
|
|
|
|
case EASTASIAN: iter_parent = &iter_eastasian; break;
|
|
|
|
case ASIAN: iter_parent = &iter_asian; break;
|
|
|
|
case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
|
|
|
|
case UNICODE: iter_parent = &iter_utf8; break;
|
|
|
|
case NONE:
|
|
|
|
default: iter_parent = NULL;
|
|
|
|
}
|
|
|
|
gtk_tree_store_append(store, &iter_current, iter_parent);
|
|
|
|
encoding_string = encodings_to_string(&encodings[i]);
|
|
|
|
gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
|
|
|
|
g_free(encoding_string);
|
|
|
|
}
|
|
|
|
|
|
|
|
gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
|
|
|
|
gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
|
|
|
|
|
|
|
|
return store;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Returns the integer stored in column 0 of the row @a iter points at in @a store
 * (the encoding index written by encodings_encoding_store_new()). */
gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
{
	GtkTreeModel *model = GTK_TREE_MODEL(store);
	gint enc;

	gtk_tree_model_get(model, iter, 0, &enc, -1);

	return enc;
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Walks @a store starting at its first row, advancing with ui_tree_model_iter_any_next(),
 * until a row whose column 0 equals @a enc is found.
 * On success @a iter points at that row and TRUE is returned; otherwise FALSE is returned
 * (and @a iter should not be relied upon). */
gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
{
	GtkTreeModel *model = GTK_TREE_MODEL(store);
	gboolean have_row = gtk_tree_model_get_iter_first(model, iter);

	while (have_row)
	{
		if (encodings_encoding_store_get_encoding(store, iter) == enc)
			return TRUE;

		have_row = ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE);
	}
	return FALSE;
}
|
|
|
|
|
|
|
|
|
|
|
|
void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
|
|
|
|
GtkCellRenderer *cell,
|
|
|
|
GtkTreeModel *tree_model,
|
|
|
|
GtkTreeIter *iter,
|
|
|
|
gpointer data)
|
|
|
|
{
|
|
|
|
gboolean sensitive = !gtk_tree_model_iter_has_child(tree_model, iter);
|
|
|
|
gchar *text;
|
|
|
|
|
|
|
|
gtk_tree_model_get(tree_model, iter, 1, &text, -1);
|
|
|
|
g_object_set(cell, "sensitive", sensitive, "text", text, NULL);
|
|
|
|
g_free(text);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-02-17 18:00:42 +00:00
|
|
|
/**
|
|
|
|
* Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
|
|
|
|
* If @a fast is not set, additional checks to validate the converted string are performed.
|
2008-01-16 16:50:10 +00:00
|
|
|
*
|
2008-02-17 18:00:42 +00:00
|
|
|
* @param buffer The input string to convert.
|
|
|
|
* @param size The length of the string, or -1 if the string is nul-terminated.
|
|
|
|
* @param charset The charset to be used for conversion.
|
|
|
|
* @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
|
2008-01-16 16:50:10 +00:00
|
|
|
*
|
2008-02-17 18:00:42 +00:00
|
|
|
* @return If the conversion was successful, a newly allocated nul-terminated string,
|
2009-07-21 22:12:13 +00:00
|
|
|
* which must be freed with @c g_free(). Otherwise @c NULL.
|
2008-02-17 18:00:42 +00:00
|
|
|
**/
|
2014-10-13 16:36:36 -07:00
|
|
|
GEANY_API_SYMBOL
|
2012-01-03 13:30:38 +00:00
|
|
|
gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
|
2008-01-16 16:50:10 +00:00
|
|
|
const gchar *charset, gboolean fast)
|
2006-10-25 14:48:07 +00:00
|
|
|
{
|
|
|
|
gchar *utf8_content = NULL;
|
|
|
|
GError *conv_error = NULL;
|
|
|
|
gchar* converted_contents = NULL;
|
|
|
|
gsize bytes_written;
|
|
|
|
|
2009-04-15 22:47:33 +00:00
|
|
|
g_return_val_if_fail(buffer != NULL, NULL);
|
|
|
|
g_return_val_if_fail(charset != NULL, NULL);
|
2006-10-25 14:48:07 +00:00
|
|
|
|
|
|
|
converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
|
|
|
|
&bytes_written, &conv_error);
|
|
|
|
|
|
|
|
if (fast)
|
|
|
|
{
|
|
|
|
utf8_content = converted_contents;
|
|
|
|
if (conv_error != NULL) g_error_free(conv_error);
|
|
|
|
}
|
|
|
|
else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
|
|
|
|
{
|
|
|
|
if (conv_error != NULL)
|
|
|
|
{
|
|
|
|
geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
|
|
|
|
g_error_free(conv_error);
|
|
|
|
conv_error = NULL;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
geany_debug("Couldn't convert from %s to UTF-8.", charset);
|
|
|
|
|
|
|
|
utf8_content = NULL;
|
2010-09-09 17:21:05 +00:00
|
|
|
g_free(converted_contents);
|
2006-10-25 14:48:07 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
geany_debug("Converted from %s to UTF-8.", charset);
|
|
|
|
utf8_content = converted_contents;
|
|
|
|
}
|
|
|
|
|
|
|
|
return utf8_content;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-03-31 23:28:10 +00:00
|
|
|
/* Runs each pattern in pregs over the buffer, in array order, and returns the result of the
 * first regex_match() call that is not NULL, or NULL if none of them matched. */
static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
{
	guint idx = 0;

	while (idx < G_N_ELEMENTS(pregs))
	{
		gchar *found = regex_match(pregs[idx], buffer, size);

		if (found != NULL)
			return found;
		idx++;
	}
	return NULL;
}
|
|
|
|
|
|
|
|
|
2012-01-03 13:30:38 +00:00
|
|
|
/* Tries to convert buffer to UTF-8 by testing candidate charsets one after another and
 * returning the first conversion that succeeds (with full validation, i.e. not in fast mode).
 *
 * Candidates are tried in this order:
 *   1. suggested_charset, if not NULL (normalized if possible, otherwise as given),
 *   2. the locale charset, if g_get_charset() reports that the locale is not UTF-8,
 *   3. the preferred charset from file_prefs.default_open_encoding, if it is a valid index,
 *   4. every entry of encodings[] in index order, skipping GEANY_ENCODING_NONE.
 *
 * buffer: the data to convert, must not be NULL.
 * size: length of buffer in bytes, or -1 if buffer is nul-terminated.
 * suggested_charset: charset to try first, or NULL.
 * used_encoding: if not NULL, receives a newly allocated copy of the charset which worked;
 *                a previous non-NULL value stored there is freed first.
 *
 * Returns a newly allocated UTF-8 string, or NULL if no candidate charset worked. */
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
		const gchar *suggested_charset, gchar **used_encoding)
{
	const gchar *locale_charset = NULL;
	const gchar *charset;
	gchar *utf8_content;
	gboolean check_suggestion = suggested_charset != NULL;
	gboolean check_locale = FALSE;
	gint i, preferred_charset;

	/* strlen() below must not be called on NULL */
	g_return_val_if_fail(buffer != NULL, NULL);

	if (size == -1)
	{
		size = strlen(buffer);
	}

	/* current locale is not UTF-8, we have to check this charset */
	check_locale = ! g_get_charset(&locale_charset);

	/* First check for preferred charset, if specified */
	preferred_charset = file_prefs.default_open_encoding;

	if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
		preferred_charset < 0 ||
		preferred_charset >= GEANY_ENCODINGS_MAX)
	{
		preferred_charset = -1;
	}

	/* -1 means "Preferred charset" */
	for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
	{
		if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
			continue;

		if (check_suggestion)
		{
			check_suggestion = FALSE;
			charset = encodings_normalize_charset(suggested_charset);
			if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
				charset = suggested_charset;
			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
		}
		else if (check_locale)
		{
			check_locale = FALSE;
			charset = locale_charset;
			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
		}
		else if (i == -1)
		{
			if (preferred_charset >= 0)
			{
				charset = encodings[preferred_charset].charset;
				geany_debug("Using preferred charset: %s", charset);
			}
			else
				continue;
		}
		else if (i >= 0)
			charset = encodings[i].charset;
		else /* in this case we have i == -2, continue to increase i and go ahead */
			continue;

		if (G_UNLIKELY(charset == NULL))
			continue;

		/* size is signed but never negative at this point; cast it so that the argument
		 * type matches the unsigned G_GSIZE_FORMAT conversion */
		geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
			(gsize) size, charset);
		utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);

		if (G_LIKELY(utf8_content != NULL))
		{
			if (used_encoding != NULL)
			{
				if (G_UNLIKELY(*used_encoding != NULL))
				{
					/* the caller passed in an already set encoding, replace it */
					geany_debug("%s:%d", __FILE__, __LINE__);
					g_free(*used_encoding);
				}
				*used_encoding = g_strdup(charset);
			}
			return utf8_content;
		}
	}

	return NULL;
}
|
2007-01-07 16:22:41 +00:00
|
|
|
|
|
|
|
|
2011-03-31 23:28:10 +00:00
|
|
|
/**
|
|
|
|
* Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
|
|
|
|
* @a used_encoding.
|
|
|
|
*
|
|
|
|
* @param buffer the input string to convert.
|
|
|
|
* @param size the length of the string, or -1 if the string is nul-terminated.
|
|
|
|
* @param used_encoding return location of the detected encoding of the input string, or @c NULL.
|
|
|
|
*
|
|
|
|
* @return If the conversion was successful, a newly allocated nul-terminated string,
|
|
|
|
* which must be freed with @c g_free(). Otherwise @c NULL.
|
|
|
|
**/
|
2014-10-13 16:36:36 -07:00
|
|
|
GEANY_API_SYMBOL
|
2012-01-03 13:30:38 +00:00
|
|
|
gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
|
2011-03-31 23:28:10 +00:00
|
|
|
{
|
|
|
|
gchar *regex_charset;
|
|
|
|
gchar *utf8;
|
|
|
|
|
|
|
|
/* first try to read the encoding from the file content */
|
|
|
|
regex_charset = encodings_check_regexes(buffer, size);
|
|
|
|
utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
|
|
|
|
g_free(regex_charset);
|
|
|
|
|
|
|
|
return utf8;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-01-07 16:22:41 +00:00
|
|
|
/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
|
|
|
|
* otherwise GEANY_ENCODING_NONE.
|
|
|
|
* */
|
|
|
|
GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
|
|
|
|
{
|
|
|
|
if (len >= 3)
|
|
|
|
{
|
|
|
|
if (bom_len)
|
|
|
|
*bom_len = 3;
|
|
|
|
|
|
|
|
if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
|
|
|
|
(guchar)string[2] == 0xbf)
|
|
|
|
{
|
|
|
|
return GEANY_ENCODING_UTF_8;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (len >= 4)
|
|
|
|
{
|
|
|
|
if (bom_len)
|
|
|
|
*bom_len = 4;
|
|
|
|
|
|
|
|
if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
|
|
|
|
(guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
|
|
|
|
{
|
2008-02-27 13:17:29 +00:00
|
|
|
return GEANY_ENCODING_UTF_32BE; /* Big endian */
|
2007-01-07 16:22:41 +00:00
|
|
|
}
|
|
|
|
if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
|
|
|
|
(guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
|
|
|
|
{
|
2008-02-27 13:17:29 +00:00
|
|
|
return GEANY_ENCODING_UTF_32LE; /* Little endian */
|
2007-01-07 16:22:41 +00:00
|
|
|
}
|
|
|
|
if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
|
|
|
|
(string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
|
|
|
|
{
|
|
|
|
return GEANY_ENCODING_UTF_7;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (len >= 2)
|
|
|
|
{
|
|
|
|
if (bom_len)
|
|
|
|
*bom_len = 2;
|
|
|
|
|
2009-09-16 14:13:38 +00:00
|
|
|
if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
|
2007-01-07 16:22:41 +00:00
|
|
|
{
|
2008-02-27 13:17:29 +00:00
|
|
|
return GEANY_ENCODING_UTF_16BE; /* Big endian */
|
2007-01-07 16:22:41 +00:00
|
|
|
}
|
|
|
|
if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
|
|
|
|
{
|
2008-02-27 13:17:29 +00:00
|
|
|
return GEANY_ENCODING_UTF_16LE; /* Little endian */
|
2007-01-07 16:22:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (bom_len)
|
|
|
|
*bom_len = 0;
|
|
|
|
return GEANY_ENCODING_NONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Returns TRUE if string is not NULL and starts with "UTF" or "UCS" (case-sensitive),
 * FALSE otherwise. */
gboolean encodings_is_unicode_charset(const gchar *string)
{
	return string != NULL &&
		(strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0);
}
|
|
|
|
|
|
|
|
|
2011-03-18 15:57:02 +00:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
gchar *data; /* null-terminated data */
|
|
|
|
gsize size; /* actual data size */
|
|
|
|
gsize len; /* string length of data */
|
|
|
|
gchar *enc;
|
|
|
|
gboolean bom;
|
|
|
|
gboolean partial;
|
|
|
|
} BufferData;
|
|
|
|
|
|
|
|
|
|
|
|
/* convert data with the specified encoding */
|
|
|
|
static gboolean
|
|
|
|
handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
|
|
|
|
{
|
|
|
|
GeanyEncodingIndex enc_idx;
|
|
|
|
|
|
|
|
if (utils_str_equal(forced_enc, "UTF-8"))
|
|
|
|
{
|
|
|
|
if (! g_utf8_validate(buffer->data, buffer->len, NULL))
|
|
|
|
{
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
gchar *converted_text = encodings_convert_to_utf8_from_charset(
|
|
|
|
buffer->data, buffer->size, forced_enc, FALSE);
|
|
|
|
if (converted_text == NULL)
|
|
|
|
{
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2012-01-25 16:26:16 +00:00
|
|
|
SETPTR(buffer->data, converted_text);
|
2011-03-18 15:57:02 +00:00
|
|
|
buffer->len = strlen(converted_text);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
|
|
|
|
buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
|
|
|
|
buffer->enc = g_strdup(forced_enc);
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* detect encoding and convert to UTF-8 if necessary */
|
|
|
|
static gboolean
|
|
|
|
handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
|
|
|
|
{
|
|
|
|
g_return_val_if_fail(buffer->enc == NULL, FALSE);
|
|
|
|
g_return_val_if_fail(buffer->bom == FALSE, FALSE);
|
|
|
|
|
|
|
|
if (buffer->size == 0)
|
|
|
|
{
|
|
|
|
/* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
|
|
|
|
* e.g. UTF32 file with a BOM(so size is 4, len is 0) */
|
|
|
|
buffer->enc = g_strdup("UTF-8");
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* first check for a BOM */
|
|
|
|
if (enc_idx != GEANY_ENCODING_NONE)
|
|
|
|
{
|
|
|
|
buffer->enc = g_strdup(encodings[enc_idx].charset);
|
|
|
|
buffer->bom = TRUE;
|
|
|
|
|
|
|
|
if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
|
|
|
|
{
|
|
|
|
gchar *converted_text = encodings_convert_to_utf8_from_charset(
|
|
|
|
buffer->data, buffer->size, buffer->enc, FALSE);
|
|
|
|
if (converted_text != NULL)
|
|
|
|
{
|
2012-01-25 16:26:16 +00:00
|
|
|
SETPTR(buffer->data, converted_text);
|
2011-03-18 15:57:02 +00:00
|
|
|
buffer->len = strlen(converted_text);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* there was a problem converting data from BOM encoding type */
|
2012-01-25 16:26:16 +00:00
|
|
|
SETPTR(buffer->enc, NULL);
|
2011-03-18 15:57:02 +00:00
|
|
|
buffer->bom = FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
|
|
|
|
{
|
2011-03-31 23:28:10 +00:00
|
|
|
/* first try to read the encoding from the file content */
|
|
|
|
gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
|
|
|
|
|
2011-03-18 15:57:02 +00:00
|
|
|
/* try UTF-8 first */
|
2011-03-31 23:28:10 +00:00
|
|
|
if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
|
|
|
|
(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
|
2011-03-18 15:57:02 +00:00
|
|
|
{
|
|
|
|
buffer->enc = g_strdup("UTF-8");
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* detect the encoding */
|
2011-03-31 23:28:10 +00:00
|
|
|
gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
|
|
|
|
buffer->size, regex_charset, &buffer->enc);
|
2011-03-18 15:57:02 +00:00
|
|
|
|
|
|
|
if (converted_text == NULL)
|
|
|
|
{
|
2011-03-31 23:28:10 +00:00
|
|
|
g_free(regex_charset);
|
2011-03-18 15:57:02 +00:00
|
|
|
return FALSE;
|
|
|
|
}
|
2012-01-25 16:26:16 +00:00
|
|
|
SETPTR(buffer->data, converted_text);
|
2011-03-18 15:57:02 +00:00
|
|
|
buffer->len = strlen(converted_text);
|
|
|
|
}
|
2011-03-31 23:28:10 +00:00
|
|
|
g_free(regex_charset);
|
2011-03-18 15:57:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
handle_bom(BufferData *buffer)
|
|
|
|
{
|
|
|
|
guint bom_len;
|
|
|
|
|
|
|
|
encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
|
|
|
|
g_return_if_fail(bom_len != 0);
|
|
|
|
|
|
|
|
/* use filedata->len here because the contents are already converted into UTF-8 */
|
|
|
|
buffer->len -= bom_len;
|
|
|
|
/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
|
|
|
|
g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
|
|
|
|
buffer->data = g_realloc(buffer->data, buffer->len + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
|
|
|
|
static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
|
|
|
|
{
|
|
|
|
GeanyEncodingIndex tmp_enc_idx;
|
|
|
|
|
|
|
|
/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
|
|
|
|
* if we have a BOM */
|
|
|
|
tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
|
|
|
|
|
|
|
|
/* check whether the size of the loaded data is equal to the size of the file in the
|
|
|
|
* filesystem file size may be 0 to allow opening files in /proc/ which have typically a
|
|
|
|
* file size of 0 bytes */
|
|
|
|
if (buffer->len != buffer->size && buffer->size != 0 && (
|
|
|
|
tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
|
|
|
|
tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */
|
|
|
|
{
|
|
|
|
buffer->partial = TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Determine character encoding and convert to UTF-8 */
|
|
|
|
if (forced_enc != NULL)
|
|
|
|
{
|
|
|
|
/* the encoding should be ignored(requested by user), so open the file "as it is" */
|
|
|
|
if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
|
|
|
|
{
|
|
|
|
buffer->bom = FALSE;
|
|
|
|
buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
|
|
|
|
}
|
|
|
|
else if (! handle_forced_encoding(buffer, forced_enc))
|
|
|
|
{
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (! handle_encoding(buffer, tmp_enc_idx))
|
|
|
|
{
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (buffer->bom)
|
|
|
|
handle_bom(buffer);
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
|
|
|
|
* and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
|
|
|
|
*
|
|
|
|
* @param buf a pointer to modifiable null-terminated buffer to convert.
|
|
|
|
* It may or may not be modified, and should be freed whatever happens.
|
|
|
|
* @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
|
|
|
|
* file size). It will be updated to the new size.
|
|
|
|
* @param forced_enc forced encoding to use, or @c NULL
|
|
|
|
* @param used_encoding return location for the actually used encoding, or @c NULL
|
|
|
|
* @param has_bom return location to store whether the data had a BOM, or @c NULL
|
|
|
|
* @param partial return location to store whether the conversion may be partial, or @c NULL
|
|
|
|
*
|
|
|
|
* @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
|
|
|
|
*/
|
|
|
|
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
|
|
|
|
gchar **used_encoding, gboolean *has_bom, gboolean *partial)
|
|
|
|
{
|
|
|
|
BufferData buffer;
|
|
|
|
|
|
|
|
buffer.data = *buf;
|
|
|
|
buffer.size = *size;
|
|
|
|
/* use strlen to check for null chars */
|
|
|
|
buffer.len = strlen(buffer.data);
|
|
|
|
buffer.enc = NULL;
|
|
|
|
buffer.bom = FALSE;
|
|
|
|
buffer.partial = FALSE;
|
|
|
|
|
|
|
|
if (! handle_buffer(&buffer, forced_enc))
|
|
|
|
return FALSE;
|
|
|
|
|
|
|
|
*size = buffer.len;
|
|
|
|
if (used_encoding)
|
|
|
|
*used_encoding = buffer.enc;
|
2011-09-02 21:45:20 +00:00
|
|
|
else
|
|
|
|
g_free(buffer.enc);
|
2011-03-18 15:57:02 +00:00
|
|
|
if (has_bom)
|
|
|
|
*has_bom = buffer.bom;
|
|
|
|
if (partial)
|
|
|
|
*partial = buffer.partial;
|
|
|
|
|
|
|
|
*buf = buffer.data;
|
|
|
|
return TRUE;
|
|
|
|
}
|