1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 /* Written by Bruno Haible <bruno@clisp.org>. */
24 #include "localcharset.h"
32 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
36 #if defined _WIN32 || defined __WIN32__
41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
47 #if !defined WIN32_NATIVE
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
52 # if 0 /* see comment below */
57 # define WIN32_LEAN_AND_MEAN
60 #elif defined WIN32_NATIVE
61 # define WIN32_LEAN_AND_MEAN
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
72 # define relocate(pathname) (pathname)
77 # include "configmake.h"
80 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
85 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
86 /* Win32, Cygwin, OS/2, DOS */
87 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
90 #ifndef DIRECTORY_SEPARATOR
91 # define DIRECTORY_SEPARATOR '/'
95 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
98 #if HAVE_DECL_GETC_UNLOCKED
100 # define getc getc_unlocked
103 /* The following static variable is declared 'volatile' to avoid a
104 possible multithread problem in the function get_charset_aliases. If we
105 are running in a threaded environment, and if two threads initialize
106 'charset_aliases' simultaneously, both will produce the same value,
107 and everything will be ok if the two assignments to 'charset_aliases'
108 are atomic. But I don't know what will happen if the two assignments mix. */
110 # define volatile /* empty */
112 /* Pointer to the contents of the charset.alias file, if it has already been
113 read, else NULL. Its format is:
114 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
115 static const char * volatile charset_aliases;
117 /* Return a pointer to the contents of the charset.alias file. */
119 get_charset_aliases (void)
123 cp = charset_aliases;
126 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
128 const char *base = "charset.alias";
131 /* Make it possible to override the charset.alias location. This is
132 necessary for running the testsuite before "make install". */
133 dir = getenv ("CHARSETALIASDIR");
134 if (dir == NULL || dir[0] == '\0')
135 dir = relocate (LIBDIR);
137 /* Concatenate dir and base into freshly allocated file_name. */
139 size_t dir_len = strlen (dir);
140 size_t base_len = strlen (base);
141 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143 if (file_name != NULL)
145 memcpy (file_name, dir, dir_len);
147 file_name[dir_len] = DIRECTORY_SEPARATOR;
148 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
152 if (file_name == NULL)
153 /* Out of memory. Treat the file as empty. */
159 /* Open the file. Reject symbolic links on platforms that support
160 O_NOFOLLOW. This is a security feature. Without it, an attacker
161 could retrieve parts of the contents (namely, the tail of the
162 first line that starts with "* ") of an arbitrary file by placing
163 a symbolic link to that file under the name "charset.alias" in
164 some writable directory and defining the environment variable
165 CHARSETALIASDIR to point to that directory. */
166 fd = open (file_name,
167 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
169 /* File not found. Treat it as empty. */
175 fp = fdopen (fd, "r");
178 /* Out of memory. Treat the file as empty. */
184 /* Parse the file's contents. */
185 char *res_ptr = NULL;
199 if (c == '\n' || c == ' ' || c == '\t')
203 /* Skip comment, to end of line. */
206 while (!(c == EOF || c == '\n'));
212 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
216 old_res_ptr = res_ptr;
219 res_size = l1 + 1 + l2 + 1;
220 res_ptr = (char *) malloc (res_size + 1);
224 res_size += l1 + 1 + l2 + 1;
225 res_ptr = (char *) realloc (res_ptr, res_size + 1);
231 if (old_res_ptr != NULL)
235 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236 strcpy (res_ptr + res_size - (l2 + 1), buf2);
243 *(res_ptr + res_size) = '\0';
255 /* To avoid the trouble of installing a file that is shared by many
256 GNU packages -- many packaging systems have problems with this --,
257 simply inline the aliases here. */
258 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
259 "ISO8859-2" "\0" "ISO-8859-2" "\0"
260 "ISO8859-4" "\0" "ISO-8859-4" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-9" "\0" "ISO-8859-9" "\0"
264 "ISO8859-13" "\0" "ISO-8859-13" "\0"
265 "ISO8859-15" "\0" "ISO-8859-15" "\0"
266 "KOI8-R" "\0" "KOI8-R" "\0"
267 "KOI8-U" "\0" "KOI8-U" "\0"
268 "CP866" "\0" "CP866" "\0"
269 "CP949" "\0" "CP949" "\0"
270 "CP1131" "\0" "CP1131" "\0"
271 "CP1251" "\0" "CP1251" "\0"
272 "eucCN" "\0" "GB2312" "\0"
273 "GB2312" "\0" "GB2312" "\0"
274 "eucJP" "\0" "EUC-JP" "\0"
275 "eucKR" "\0" "EUC-KR" "\0"
276 "Big5" "\0" "BIG5" "\0"
277 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278 "GBK" "\0" "GBK" "\0"
279 "GB18030" "\0" "GB18030" "\0"
280 "SJIS" "\0" "SHIFT_JIS" "\0"
281 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
282 "PT154" "\0" "PT154" "\0"
283 /*"ISCII-DEV" "\0" "?" "\0"*/
284 "*" "\0" "UTF-8" "\0";
288 /* To avoid the troubles of an extra file charset.alias_vms in the
289 sources of many GNU packages, simply inline the aliases here. */
290 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
291 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292 section 10.7 "Handling Different Character Sets". */
293 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
294 "ISO8859-2" "\0" "ISO-8859-2" "\0"
295 "ISO8859-5" "\0" "ISO-8859-5" "\0"
296 "ISO8859-7" "\0" "ISO-8859-7" "\0"
297 "ISO8859-8" "\0" "ISO-8859-8" "\0"
298 "ISO8859-9" "\0" "ISO-8859-9" "\0"
300 "eucJP" "\0" "EUC-JP" "\0"
301 "SJIS" "\0" "SHIFT_JIS" "\0"
302 "DECKANJI" "\0" "DEC-KANJI" "\0"
303 "SDECKANJI" "\0" "EUC-JP" "\0"
305 "eucTW" "\0" "EUC-TW" "\0"
306 "DECHANYU" "\0" "DEC-HANYU" "\0"
307 "DECHANZI" "\0" "GB2312" "\0"
309 "DECKOREAN" "\0" "EUC-KR" "\0";
312 # if defined WIN32_NATIVE || defined __CYGWIN__
313 /* To avoid the troubles of installing a separate file in the same
314 directory as the DLL and of retrieving the DLL's directory at
315 runtime, simply inline the aliases here. */
317 cp = "CP936" "\0" "GBK" "\0"
318 "CP1361" "\0" "JOHAB" "\0"
319 "CP20127" "\0" "ASCII" "\0"
320 "CP20866" "\0" "KOI8-R" "\0"
321 "CP20936" "\0" "GB2312" "\0"
322 "CP21866" "\0" "KOI8-RU" "\0"
323 "CP28591" "\0" "ISO-8859-1" "\0"
324 "CP28592" "\0" "ISO-8859-2" "\0"
325 "CP28593" "\0" "ISO-8859-3" "\0"
326 "CP28594" "\0" "ISO-8859-4" "\0"
327 "CP28595" "\0" "ISO-8859-5" "\0"
328 "CP28596" "\0" "ISO-8859-6" "\0"
329 "CP28597" "\0" "ISO-8859-7" "\0"
330 "CP28598" "\0" "ISO-8859-8" "\0"
331 "CP28599" "\0" "ISO-8859-9" "\0"
332 "CP28605" "\0" "ISO-8859-15" "\0"
333 "CP38598" "\0" "ISO-8859-8" "\0"
334 "CP51932" "\0" "EUC-JP" "\0"
335 "CP51936" "\0" "GB2312" "\0"
336 "CP51949" "\0" "EUC-KR" "\0"
337 "CP51950" "\0" "EUC-TW" "\0"
338 "CP54936" "\0" "GB18030" "\0"
339 "CP65001" "\0" "UTF-8" "\0";
343 charset_aliases = cp;
349 /* Determine the current locale's character encoding, and canonicalize it
350 into one of the canonical names listed in config.charset.
351 The result must not be freed; it is statically allocated.
352 If the canonical name cannot be determined, the result is a non-canonical name. */
359 locale_charset (void)
364 #if !(defined WIN32_NATIVE || defined OS2)
366 # if HAVE_LANGINFO_CODESET
368 /* Most systems support nl_langinfo (CODESET) nowadays. */
369 codeset = nl_langinfo (CODESET);
372 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always
373 returns "US-ASCII". As long as this is not fixed, return the suffix
374 of the locale name from the environment variables (if present) or
375 the codepage as a number. */
376 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
379 static char buf[2 + 10 + 1];
381 locale = getenv ("LC_ALL");
382 if (locale == NULL || locale[0] == '\0')
384 locale = getenv ("LC_CTYPE");
385 if (locale == NULL || locale[0] == '\0')
386 locale = getenv ("LANG");
388 if (locale != NULL && locale[0] != '\0')
390 /* If the locale name contains an encoding after the dot, return it. */
392 const char *dot = strchr (locale, '.');
396 const char *modifier;
399 /* Look for the possible @... trailer and remove it, if any. */
400 modifier = strchr (dot, '@');
401 if (modifier == NULL)
403 if (modifier - dot < sizeof (buf))
405 memcpy (buf, dot, modifier - dot);
406 buf [modifier - dot] = '\0';
412 /* Woe32 has a function returning the locale's codepage as a number. */
413 sprintf (buf, "CP%u", GetACP ());
420 /* On old systems which lack it, use setlocale or getenv. */
421 const char *locale = NULL;
423 /* But most old systems don't have a complete set of locales. Some
424 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
425 use setlocale here; it would return "C" when it doesn't support the
426 locale name the user has set. */
428 locale = setlocale (LC_CTYPE, NULL);
430 if (locale == NULL || locale[0] == '\0')
432 locale = getenv ("LC_ALL");
433 if (locale == NULL || locale[0] == '\0')
435 locale = getenv ("LC_CTYPE");
436 if (locale == NULL || locale[0] == '\0')
437 locale = getenv ("LANG");
441 /* On some old systems, one used to set locale = "iso8859_1". On others,
442 you set it to "language_COUNTRY.charset". In any case, we resolve it
443 through the charset.alias file. */
448 #elif defined WIN32_NATIVE
450 static char buf[2 + 10 + 1];
452 /* Woe32 has a function returning the locale's codepage as a number. */
453 sprintf (buf, "CP%u", GetACP ());
459 static char buf[2 + 10 + 1];
463 /* Allow user to override the codeset, as set in the operating system,
464 with standard language environment variables. */
465 locale = getenv ("LC_ALL");
466 if (locale == NULL || locale[0] == '\0')
468 locale = getenv ("LC_CTYPE");
469 if (locale == NULL || locale[0] == '\0')
470 locale = getenv ("LANG");
472 if (locale != NULL && locale[0] != '\0')
474 /* If the locale name contains an encoding after the dot, return it. */
475 const char *dot = strchr (locale, '.');
479 const char *modifier;
482 /* Look for the possible @... trailer and remove it, if any. */
483 modifier = strchr (dot, '@');
484 if (modifier == NULL)
486 if (modifier - dot < sizeof (buf))
488 memcpy (buf, dot, modifier - dot);
489 buf [modifier - dot] = '\0';
494 /* Resolve through the charset.alias file. */
499 /* OS/2 has a function returning the locale's codepage as a number. */
500 if (DosQueryCp (sizeof (cp), cp, &cplen))
504 sprintf (buf, "CP%u", cp[0]);
512 /* The canonical name cannot be determined. */
516 for (aliases = get_charset_aliases ();
518 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
519 if (strcmp (codeset, aliases) == 0
520 || (aliases[0] == '*' && aliases[1] == '\0'))
522 codeset = aliases + strlen (aliases) + 1;
526 /* Don't return an empty string. GNU libc and GNU libiconv interpret
527 the empty string as denoting "the locale's character encoding",
528 thus GNU libiconv would call this function a second time. */
529 if (codeset[0] == '\0')