Audacious $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
|
00001 /* Audacious 00002 * Copyright (C) 2005-2007 Audacious development team. 00003 * 00004 * This program is free software; you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation; under version 3 of the License. 00007 * 00008 * This program is distributed in the hope that it will be useful, 00009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00011 * GNU General Public License for more details. 00012 * 00013 * You should have received a copy of the GNU General Public License 00014 * along with this program. If not, see <http://www.gnu.org/licenses>. 00015 * 00016 * The Audacious team does not consider modular code linking to 00017 * Audacious or using our public API to be a derived work. 00018 */ 00019 00020 #include <glib.h> 00021 #include <string.h> 00022 #include <libaudcore/audstrings.h> 00023 00024 #include "config.h" 00025 #include "debug.h" 00026 #include "i18n.h" 00027 #include "main.h" 00028 #include "misc.h" 00029 00030 #ifdef USE_CHARDET 00031 # include <libguess.h> 00032 #endif 00033 00034 static char * cd_chardet_to_utf8 (const char * str, int len, 00035 int * arg_bytes_read, int * arg_bytes_written); 00036 00037 static char * str_to_utf8_fallback (const char * str) 00038 { 00039 char * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL); 00040 00041 for (char * c = out; * c; c ++) 00042 { 00043 if (* c & 0x80) 00044 * c = '?'; 00045 } 00046 00047 return out; 00048 } 00049 00050 static char * cd_str_to_utf8 (const char * str) 00051 { 00052 char *out_str; 00053 00054 if (str == NULL) 00055 return NULL; 00056 00057 /* Note: Currently, playlist calls this function repeatedly, even 00058 * if the string is already converted into utf-8. 
00059 * chardet_to_utf8() would convert a valid utf-8 string into a 00060 * different utf-8 string, if fallback encodings were supplied and 00061 * the given string could be treated as a string in one of 00062 * fallback encodings. To avoid this, g_utf8_validate() had been 00063 * used at the top of evaluation. 00064 */ 00065 00066 /* Note 2: g_utf8_validate() has so called encapsulated utf-8 00067 * problem, thus chardet_to_utf8() took the place of that. 00068 */ 00069 00070 /* Note 3: As introducing madplug, the problem of conversion from 00071 * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert() 00072 * located near the end of chardet_to_utf8(), but it requires utf8 00073 * validation guard where g_utf8_validate() was. New 00074 * dfa_validate_utf8() employs libguess' DFA engine to validate 00075 * utf-8 and can properly distinguish examples of encapsulated 00076 * utf-8. It is considered to be safe to use as a guard. 00077 */ 00078 00079 /* Already UTF-8? */ 00080 #ifdef USE_CHARDET 00081 if (libguess_validate_utf8(str, strlen(str))) 00082 return g_strdup(str); 00083 #else 00084 if (g_utf8_validate(str, strlen(str), NULL)) 00085 return g_strdup(str); 00086 #endif 00087 00088 /* chardet encoding detector */ 00089 if ((out_str = cd_chardet_to_utf8 (str, strlen (str), NULL, NULL))) 00090 return out_str; 00091 00092 /* all else fails, we mask off character codes >= 128, replace with '?' */ 00093 return str_to_utf8_fallback(str); 00094 } 00095 00096 static char * cd_chardet_to_utf8 (const char * str, int len, 00097 int * arg_bytes_read, int * arg_bytes_write) 00098 { 00099 char *ret = NULL; 00100 int * bytes_read, * bytes_write; 00101 int my_bytes_read, my_bytes_write; 00102 00103 bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read; 00104 bytes_write = arg_bytes_write != NULL ? 
arg_bytes_write : &my_bytes_write; 00105 00106 g_return_val_if_fail(str != NULL, NULL); 00107 00108 #ifdef USE_CHARDET 00109 if (libguess_validate_utf8(str, len)) 00110 #else 00111 if (g_utf8_validate(str, len, NULL)) 00112 #endif 00113 { 00114 if (len < 0) 00115 len = strlen (str); 00116 00117 ret = g_malloc (len + 1); 00118 memcpy (ret, str, len); 00119 ret[len] = 0; 00120 00121 if (arg_bytes_read != NULL) 00122 * arg_bytes_read = len; 00123 if (arg_bytes_write != NULL) 00124 * arg_bytes_write = len; 00125 00126 return ret; 00127 } 00128 00129 #ifdef USE_CHARDET 00130 char * det = get_string (NULL, "chardet_detector"); 00131 00132 if (det[0]) 00133 { 00134 AUDDBG("guess encoding (%s) %s\n", det, str); 00135 const char * encoding = libguess_determine_encoding (str, len, det); 00136 AUDDBG("encoding = %s\n", encoding); 00137 if (encoding) 00138 { 00139 gsize read_gsize = 0, written_gsize = 0; 00140 ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL); 00141 * bytes_read = read_gsize; 00142 * bytes_write = written_gsize; 00143 } 00144 } 00145 00146 g_free (det); 00147 #endif 00148 00149 /* If detection failed or was not enabled, try fallbacks (if there are any) */ 00150 if (! ret) 00151 { 00152 char * fallbacks = get_string (NULL, "chardet_fallback"); 00153 char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1); 00154 00155 for (char * * enc = split; * enc; enc ++) 00156 { 00157 gsize read_gsize = 0, written_gsize = 0; 00158 ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL); 00159 * bytes_read = read_gsize; 00160 * bytes_write = written_gsize; 00161 00162 if (len == *bytes_read) 00163 break; 00164 else { 00165 g_free(ret); 00166 ret = NULL; 00167 } 00168 } 00169 00170 g_strfreev (split); 00171 g_free (fallbacks); 00172 } 00173 00174 /* First fallback: locale (duh!) 
*/ 00175 if (ret == NULL) 00176 { 00177 gsize read_gsize = 0, written_gsize = 0; 00178 ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL); 00179 * bytes_read = read_gsize; 00180 * bytes_write = written_gsize; 00181 } 00182 00183 /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */ 00184 if (ret == NULL) 00185 { 00186 gsize read_gsize = 0, written_gsize = 0; 00187 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL); 00188 * bytes_read = read_gsize; 00189 * bytes_write = written_gsize; 00190 } 00191 00192 if (ret != NULL) 00193 { 00194 if (g_utf8_validate(ret, -1, NULL)) 00195 return ret; 00196 else 00197 { 00198 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret); 00199 g_free(ret); 00200 return NULL; 00201 } 00202 } 00203 00204 return NULL; /* If we have no idea, return NULL. */ 00205 } 00206 00207 void chardet_init (void) 00208 { 00209 #ifdef USE_CHARDET 00210 libguess_determine_encoding(NULL, -1, ""); 00211 #endif 00212 str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8); 00213 }