/* * encodings.c - this file is part of Geany, a fast and lightweight IDE * * Copyright 2005-2011 Enrico Tröger * Copyright 2006-2011 Nick Treleaven * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* * Encoding conversion and Byte Order Mark (BOM) handling. */ /* * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a * list of people on the gedit Team. * See the gedit ChangeLog files for a list of changes. 
*/ /* Stolen from anjuta */ #include #include "geany.h" #include "utils.h" #include "support.h" #include "document.h" #include "documentprivate.h" #include "msgwindow.h" #include "encodings.h" #include "callbacks.h" #include "ui_utils.h" /* */ #define PATTERN_HTMLMETA "" /* " geany_encoding=utf-8 " or " coding: utf-8 " */ #define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*" /* precompiled regexps */ static GRegex *pregs[2]; static gboolean pregs_loaded = FALSE; GeanyEncoding encodings[GEANY_ENCODINGS_MAX]; #define fill(Order, Group, Idx, Charset, Name) \ encodings[Idx].idx = Idx; \ encodings[Idx].order = Order; \ encodings[Idx].group = Group; \ encodings[Idx].charset = Charset; \ encodings[Idx].name = Name; static void init_encodings(void) { fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic")); fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek")); fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek")); fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic")); fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European")); fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western")); fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western")); fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western")); fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western")); fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic")); fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic")); fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic")); fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European")); fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European")); fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European")); fill(6, EASTEUROPEAN, 
GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic")); fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic")); /* ISO-IR-111 not available on Windows */ fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic")); fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic")); fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic")); fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian")); fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian")); fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian")); fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic")); fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic")); fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic")); fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew")); /* not available at all, ? */ fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew")); fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew")); fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual")); fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian")); fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian")); fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai")); fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish")); fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish")); fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish")); fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese")); fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese")); fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese")); fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode")); fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode")); fill(2, UNICODE, 
GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode")); fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode")); fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode")); fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode")); fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode")); fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode")); fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified")); fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified")); fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified")); /* maybe not available on Linux */ fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified")); fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional")); fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional")); fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional")); fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese")); fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese")); fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese")); fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese")); fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean")); fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean")); fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean")); fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean")); fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding")); } /* compares two encoding names in a permissive fashion. * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. 
*/ static gboolean encodings_charset_equals(const gchar *a, const gchar *b) { gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */ gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */ while (*a && *b) { gboolean is_alpha; if (g_ascii_toupper(*a) == g_ascii_toupper(*b) && ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a))) { /* either there was a real separator, or we need a implicit one (a chage from alpha to * numeric or so) */ if (! need_sep || (was_alpha != is_alpha)) { a++; b++; was_alpha = is_alpha; need_sep = FALSE; } else return FALSE; } else { guint n_sep = 0; if (! g_ascii_isalnum(*a)) { a++; n_sep++; } if (! g_ascii_isalnum(*b)) { b++; n_sep++; } if (n_sep < 1) return FALSE; else if (n_sep < 2) need_sep = TRUE; } } return *a == *b; } GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset) { gint i; if (charset == NULL) return GEANY_ENCODING_UTF_8; i = 0; while (i < GEANY_ENCODINGS_MAX) { if (encodings_charset_equals(charset, encodings[i].charset)) return i; ++i; } return GEANY_ENCODING_UTF_8; } const GeanyEncoding *encodings_get_from_charset(const gchar *charset) { gint i; if (charset == NULL) return &encodings[GEANY_ENCODING_UTF_8]; i = 0; while (i < GEANY_ENCODINGS_MAX) { if (encodings_charset_equals(charset, encodings[i].charset)) return &encodings[i]; ++i; } return NULL; } static const gchar *encodings_normalize_charset(const gchar *charset) { const GeanyEncoding *encoding; encoding = encodings_get_from_charset(charset); if (encoding != NULL) return encoding->charset; return NULL; } const GeanyEncoding *encodings_get_from_index(gint idx) { g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL); return &encodings[idx]; } /** * Gets the character set name of the specified index e.g. for use with * @ref document_set_encoding(). * * @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set. 
* * * @return The charset according to idx, or @c NULL if the index is invalid. * * @since 0.13 **/ const gchar* encodings_get_charset_from_index(gint idx) { g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL); return encodings[idx].charset; } gchar *encodings_to_string(const GeanyEncoding* enc) { g_return_val_if_fail(enc != NULL, NULL); g_return_val_if_fail(enc->name != NULL, NULL); g_return_val_if_fail(enc->charset != NULL, NULL); return g_strdup_printf("%s (%s)", enc->name, enc->charset); } const gchar *encodings_get_charset(const GeanyEncoding* enc) { g_return_val_if_fail(enc != NULL, NULL); g_return_val_if_fail(enc->charset != NULL, NULL); return enc->charset; } static GtkWidget *radio_items[GEANY_ENCODINGS_MAX]; void encodings_select_radio_item(const gchar *charset) { gint i; g_return_if_fail(charset != NULL); i = 0; while (i < GEANY_ENCODINGS_MAX) { if (utils_str_equal(charset, encodings[i].charset)) break; i++; } if (i == GEANY_ENCODINGS_MAX) i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */ /* ignore_callback has to be set by the caller */ gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE); } /* Regexp detection of file encoding declared in the file itself. * Idea and parts of code taken from Bluefish, thanks. * regex_compile() is used to compile regular expressions on program init and keep it in memory * for faster access when opening a file. Pre-compiled regexps will be freed on program exit. */ static GRegex *regex_compile(const gchar *pattern) { GError *error = NULL; GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error); if (!regex) { geany_debug("Failed to compile encoding regex (%s)", error->message); g_error_free(error); } return regex; } static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size) { gchar *encoding = NULL; GMatchInfo *minfo; if (G_UNLIKELY(! 
pregs_loaded || buffer == NULL)) return NULL; /* scan only the first 512 characters in the buffer */ size = MIN(size, 512); if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) && g_match_info_get_match_count(minfo) >= 2) { encoding = g_match_info_fetch(minfo, 1); geany_debug("Detected encoding by regex search: %s", encoding); setptr(encoding, g_utf8_strup(encoding, -1)); } g_match_info_free(minfo); return encoding; } static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data) { GeanyDocument *doc = document_get_current(); guint i = GPOINTER_TO_INT(user_data); if (ignore_callback || doc == NULL || encodings[i].charset == NULL || ! gtk_check_menu_item_get_active(menuitem) || utils_str_equal(encodings[i].charset, doc->encoding)) return; if (doc->readonly) { utils_beep(); return; } document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding)); document_set_encoding(doc, encodings[i].charset); } void encodings_finalize(void) { if (pregs_loaded) { guint i, len; len = G_N_ELEMENTS(pregs); for (i = 0; i < len; i++) { g_regex_unref(pregs[i]); } } } void encodings_init(void) { GtkWidget *item, *menu[2], *submenu, *menu_westeuro, *menu_easteuro, *menu_eastasian, *menu_asian, *menu_utf8, *menu_middleeast, *item_westeuro, *item_easteuro, *item_eastasian, *item_asian, *item_utf8, *item_middleeast; GCallback cb_func[2]; GSList *group = NULL; gchar *label; gint order, group_size; guint i, j, k; init_encodings(); if (! 
pregs_loaded) { pregs[0] = regex_compile(PATTERN_HTMLMETA); pregs[1] = regex_compile(PATTERN_CODING); pregs_loaded = TRUE; } /* create encodings submenu in document menu */ menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu"); menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu"); cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb); cb_func[1] = G_CALLBACK(on_reload_as_activate); for (k = 0; k < 2; k++) { menu_westeuro = gtk_menu_new(); item_westeuro = gtk_menu_item_new_with_mnemonic(_("_West European")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_westeuro), menu_westeuro); gtk_container_add(GTK_CONTAINER(menu[k]), item_westeuro); gtk_widget_show_all(item_westeuro); menu_easteuro = gtk_menu_new(); item_easteuro = gtk_menu_item_new_with_mnemonic(_("_East European")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_easteuro), menu_easteuro); gtk_container_add(GTK_CONTAINER(menu[k]), item_easteuro); gtk_widget_show_all(item_easteuro); menu_eastasian = gtk_menu_new(); item_eastasian = gtk_menu_item_new_with_mnemonic(_("East _Asian")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_eastasian), menu_eastasian); gtk_container_add(GTK_CONTAINER(menu[k]), item_eastasian); gtk_widget_show_all(item_eastasian); menu_asian = gtk_menu_new(); item_asian = gtk_menu_item_new_with_mnemonic(_("_SE & SW Asian")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_asian), menu_asian); gtk_container_add(GTK_CONTAINER(menu[k]), item_asian); gtk_widget_show_all(item_asian); menu_middleeast = gtk_menu_new(); item_middleeast = gtk_menu_item_new_with_mnemonic(_("_Middle Eastern")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_middleeast), menu_middleeast); gtk_container_add(GTK_CONTAINER(menu[k]), item_middleeast); gtk_widget_show_all(item_middleeast); menu_utf8 = gtk_menu_new(); item_utf8 = gtk_menu_item_new_with_mnemonic(_("_Unicode")); gtk_menu_item_set_submenu(GTK_MENU_ITEM(item_utf8), menu_utf8); gtk_container_add(GTK_CONTAINER(menu[k]), item_utf8); 
gtk_widget_show_all(item_utf8); /** TODO can it be optimized? ATM 3782 runs at line "if (encodings[j].group ...)" */ for (i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++) { order = 0; switch (i) { case WESTEUROPEAN: submenu = menu_westeuro; group_size = 9; break; case EASTEUROPEAN: submenu = menu_easteuro; group_size = 14; break; case EASTASIAN: submenu = menu_eastasian; group_size = 14; break; case ASIAN: submenu = menu_asian; group_size = 9; break; case MIDDLEEASTERN: submenu = menu_middleeast; group_size = 7; break; case UNICODE: submenu = menu_utf8; group_size = 8; break; default: submenu = menu[k]; group_size = 1; } while (order < group_size) /* the biggest group has 13 elements */ { for (j = 0; j < GEANY_ENCODINGS_MAX; j++) { if (encodings[j].group == i && encodings[j].order == order) { label = encodings_to_string(&encodings[j]); if (k == 0) { item = gtk_radio_menu_item_new_with_label(group, label); group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item)); radio_items[j] = item; } else item = gtk_menu_item_new_with_label(label); gtk_widget_show(item); gtk_container_add(GTK_CONTAINER(submenu), item); g_signal_connect(item, "activate", cb_func[k], GINT_TO_POINTER(encodings[j].idx)); g_free(label); break; } } order++; } } } } /** * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. * If @a fast is not set, additional checks to validate the converted string are performed. * * @param buffer The input string to convert. * @param size The length of the string, or -1 if the string is nul-terminated. * @param charset The charset to be used for conversion. * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. * * @return If the conversion was successful, a newly allocated nul-terminated string, * which must be freed with @c g_free(). Otherwise @c NULL. 
**/ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size, const gchar *charset, gboolean fast) { gchar *utf8_content = NULL; GError *conv_error = NULL; gchar* converted_contents = NULL; gsize bytes_written; g_return_val_if_fail(buffer != NULL, NULL); g_return_val_if_fail(charset != NULL, NULL); converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL, &bytes_written, &conv_error); if (fast) { utf8_content = converted_contents; if (conv_error != NULL) g_error_free(conv_error); } else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL)) { if (conv_error != NULL) { geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message); g_error_free(conv_error); conv_error = NULL; } else geany_debug("Couldn't convert from %s to UTF-8.", charset); utf8_content = NULL; g_free(converted_contents); } else { geany_debug("Converted from %s to UTF-8.", charset); utf8_content = converted_contents; } return utf8_content; } static gchar *encodings_check_regexes(const gchar *buffer, gsize size) { guint i; for (i = 0; i < G_N_ELEMENTS(pregs); i++) { gchar *charset; if ((charset = regex_match(pregs[i], buffer, size)) != NULL) return charset; } return NULL; } static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size, const gchar *suggested_charset, gchar **used_encoding) { const gchar *locale_charset = NULL; const gchar *charset; gchar *utf8_content; gboolean check_suggestion = suggested_charset != NULL; gboolean check_locale = FALSE; gint i, preferred_charset; if ((gint)size == -1) { size = strlen(buffer); } /* current locale is not UTF-8, we have to check this charset */ check_locale = ! 
g_get_charset(&locale_charset); /* First check for preferred charset, if specified */ preferred_charset = file_prefs.default_open_encoding; if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx || preferred_charset < 0 || preferred_charset >= GEANY_ENCODINGS_MAX) { preferred_charset = -1; } /* -1 means "Preferred charset" */ for (i = -1; i < GEANY_ENCODINGS_MAX; i++) { if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx)) continue; if (check_suggestion) { check_suggestion = FALSE; charset = encodings_normalize_charset(suggested_charset); if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */ charset = suggested_charset; i = -2; /* keep i below the start value to have it again at -1 on the next loop run */ } else if (check_locale) { check_locale = FALSE; charset = locale_charset; i = -2; /* keep i below the start value to have it again at -1 on the next loop run */ } else if (i == -1) { if (preferred_charset >= 0) { charset = encodings[preferred_charset].charset; geany_debug("Using preferred charset: %s", charset); } else continue; } else if (i >= 0) charset = encodings[i].charset; else /* in this case we have i == -2, continue to increase i and go ahead */ continue; if (G_UNLIKELY(charset == NULL)) continue; geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.", size, charset); utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE); if (G_LIKELY(utf8_content != NULL)) { if (used_encoding != NULL) { if (G_UNLIKELY(*used_encoding != NULL)) { geany_debug("%s:%d", __FILE__, __LINE__); g_free(*used_encoding); } *used_encoding = g_strdup(charset); } return utf8_content; } } return NULL; } /** * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in * @a used_encoding. * * @param buffer the input string to convert. * @param size the length of the string, or -1 if the string is nul-terminated. 
* @param used_encoding return location of the detected encoding of the input string, or @c NULL. * * @return If the conversion was successful, a newly allocated nul-terminated string, * which must be freed with @c g_free(). Otherwise @c NULL. **/ gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding) { gchar *regex_charset; gchar *utf8; /* first try to read the encoding from the file content */ regex_charset = encodings_check_regexes(buffer, size); utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding); g_free(regex_charset); return utf8; } /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index, * otherwise GEANY_ENCODING_NONE. * */ GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len) { if (len >= 3) { if (bom_len) *bom_len = 3; if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb && (guchar)string[2] == 0xbf) { return GEANY_ENCODING_UTF_8; } } if (len >= 4) { if (bom_len) *bom_len = 4; if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 && (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff) { return GEANY_ENCODING_UTF_32BE; /* Big endian */ } if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe && (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00) { return GEANY_ENCODING_UTF_32LE; /* Little endian */ } if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) && (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f)) { return GEANY_ENCODING_UTF_7; } } if (len >= 2) { if (bom_len) *bom_len = 2; if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff) { return GEANY_ENCODING_UTF_16BE; /* Big endian */ } if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe) { return GEANY_ENCODING_UTF_16LE; /* Little endian */ } } if (bom_len) *bom_len = 0; return GEANY_ENCODING_NONE; } gboolean encodings_is_unicode_charset(const gchar *string) { if (string != NULL && (strncmp(string, 
"UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0)) { return TRUE; } return FALSE; } typedef struct { gchar *data; /* null-terminated data */ gsize size; /* actual data size */ gsize len; /* string length of data */ gchar *enc; gboolean bom; gboolean partial; } BufferData; /* convert data with the specified encoding */ static gboolean handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) { GeanyEncodingIndex enc_idx; if (utils_str_equal(forced_enc, "UTF-8")) { if (! g_utf8_validate(buffer->data, buffer->len, NULL)) { return FALSE; } } else { gchar *converted_text = encodings_convert_to_utf8_from_charset( buffer->data, buffer->size, forced_enc, FALSE); if (converted_text == NULL) { return FALSE; } else { setptr(buffer->data, converted_text); buffer->len = strlen(converted_text); } } enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8); buffer->enc = g_strdup(forced_enc); return TRUE; } /* detect encoding and convert to UTF-8 if necessary */ static gboolean handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) { g_return_val_if_fail(buffer->enc == NULL, FALSE); g_return_val_if_fail(buffer->bom == FALSE, FALSE); if (buffer->size == 0) { /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty * e.g. 
UTF32 file with a BOM(so size is 4, len is 0) */ buffer->enc = g_strdup("UTF-8"); } else { /* first check for a BOM */ if (enc_idx != GEANY_ENCODING_NONE) { buffer->enc = g_strdup(encodings[enc_idx].charset); buffer->bom = TRUE; if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */ { gchar *converted_text = encodings_convert_to_utf8_from_charset( buffer->data, buffer->size, buffer->enc, FALSE); if (converted_text != NULL) { setptr(buffer->data, converted_text); buffer->len = strlen(converted_text); } else { /* there was a problem converting data from BOM encoding type */ setptr(buffer->enc, NULL); buffer->bom = FALSE; } } } if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */ { /* first try to read the encoding from the file content */ gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size); /* try UTF-8 first */ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 && (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL)) { buffer->enc = g_strdup("UTF-8"); } else { /* detect the encoding */ gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data, buffer->size, regex_charset, &buffer->enc); if (converted_text == NULL) { g_free(regex_charset); return FALSE; } setptr(buffer->data, converted_text); buffer->len = strlen(converted_text); } g_free(regex_charset); } } return TRUE; } static void handle_bom(BufferData *buffer) { guint bom_len; encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len); g_return_if_fail(bom_len != 0); /* use filedata->len here because the contents are already converted into UTF-8 */ buffer->len -= bom_len; /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */ g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1); buffer->data = g_realloc(buffer->data, buffer->len + 1); } /* loads textfile data, verifies and converts to forced_enc or UTF-8. 
Also handles BOM. */
static gboolean
handle_buffer(BufferData *buffer, const gchar *forced_enc)
{
	const gchar *none_charset = encodings[GEANY_ENCODING_NONE].charset;
	GeanyEncodingIndex bom_enc_idx;
	gboolean converted = TRUE;

	/* look for a BOM up front: its encoding decides whether a length mismatch matters and
	 * is handed on to the auto-detection */
	bom_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);

	/* The string length differing from the data size means there are nul bytes in the data.
	 * A size of 0 is exempt to allow opening files in /proc/ which typically report a file
	 * size of 0 bytes. Only a UTF-8 or UTF-7 BOM marks the data as partial here, as those
	 * encodings do not allow nul bytes; the BOM encoding can be UTF-7/8/16/32 or None. */
	if (buffer->size != 0 && buffer->len != buffer->size)
	{
		switch (bom_enc_idx)
		{
			case GEANY_ENCODING_UTF_8:
			case GEANY_ENCODING_UTF_7:
				buffer->partial = TRUE;
				break;
			default:
				break;
		}
	}

	/* Determine character encoding and convert to UTF-8 */
	if (forced_enc == NULL)
	{
		converted = handle_encoding(buffer, bom_enc_idx);
	}
	else if (! utils_str_equal(forced_enc, none_charset))
	{
		converted = handle_forced_encoding(buffer, forced_enc);
	}
	else
	{
		/* the encoding should be ignored(requested by user), so open the file "as it is" */
		buffer->bom = FALSE;
		buffer->enc = g_strdup(none_charset);
	}

	if (! converted)
		return FALSE;

	if (buffer->bom)
		handle_bom(buffer);
	return TRUE;
}


/*
 * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
 * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
 *
 * @param buf a pointer to modifiable null-terminated buffer to convert.
 *   It may or may not be modified, and should be freed whatever happens.
 * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
 *   file size). It will be updated to the new size.
* @param forced_enc forced encoding to use, or @c NULL
 * @param used_encoding return location for the actually used encoding, or @c NULL
 * @param has_bom return location to store whether the data had a BOM, or @c NULL
 * @param partial return location to store whether the conversion may be partial, or @c NULL
 *
 * @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
 */
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
{
	BufferData buffer;

	/* both are dereferenced unconditionally below */
	g_return_val_if_fail(buf != NULL && *buf != NULL, FALSE);
	g_return_val_if_fail(size != NULL, FALSE);

	buffer.data = *buf;
	buffer.size = *size;
	/* use strlen to check for null chars */
	buffer.len = strlen(buffer.data);
	buffer.enc = NULL;
	buffer.bom = FALSE;
	buffer.partial = FALSE;

	if (! handle_buffer(&buffer, forced_enc))
	{
		/* the caller frees *buf whatever happens, so make sure it refers to the buffer
		 * the helpers left behind; the other outputs stay untouched on failure */
		*buf = buffer.data;
		return FALSE;
	}

	/* report the string length of the converted data as the new size */
	*size = buffer.len;
	if (used_encoding)
		*used_encoding = buffer.enc;	/* ownership passes to the caller */
	else
		g_free(buffer.enc);
	if (has_bom)
		*has_bom = buffer.bom;
	if (partial)
		*partial = buffer.partial;

	*buf = buffer.data;
	return TRUE;
}