X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fi18n.c;h=f26fa32c5a46c1e0369b80990ed9fc3bc8a57189;hb=005ed2b3279a59e9767efedc8c5d9a40f1df4c77;hp=c04dd5acaf9f6470e03b65e8544b8f7948f385cb;hpb=d6c75296e5573a997c79a7af1195b6a619c0190c;p=pspp diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index c04dd5acaf..f26fa32c5a 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,14 +22,15 @@ #include #include #include -#include #include #include #include #include +#include #include #include "libpspp/assertion.h" +#include "libpspp/compiler.h" #include "libpspp/hmapx.h" #include "libpspp/hash-functions.h" #include "libpspp/pool.h" @@ -38,10 +39,14 @@ #include "gl/c-strcase.h" #include "gl/localcharset.h" +#include "gl/minmax.h" #include "gl/xalloc.h" #include "gl/relocatable.h" #include "gl/xstrndup.h" +#include "gettext.h" +#define _(msgid) gettext (msgid) + struct converter { char *tocode; @@ -142,53 +147,75 @@ recode_string_len (const char *to, const char *from, Returns the output length if successful, -1 if the output buffer is too small. */ static ssize_t -try_recode (iconv_t conv, - const char *ip, size_t inbytes, - char *op_, size_t outbytes) +try_recode (iconv_t conv, char fallbackchar, + const char *in, size_t inbytes, + char *out_, size_t outbytes) { - /* FIXME: Need to ensure that this char is valid in the target encoding */ - const char fallbackchar = '?'; - char *op = op_; + char *out = out_; + int i; /* Put the converter into the initial shift state, in case there was any state information left over from its last usage. 
*/ iconv (conv, NULL, 0, NULL, 0); - while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes, - &op, &outbytes) == -1) - switch (errno) - { - case EINVAL: - if (outbytes < 2) - return -1; - *op++ = fallbackchar; - *op = '\0'; - return op - op_; - - case EILSEQ: - if (outbytes == 0) - return -1; - *op++ = fallbackchar; - outbytes--; - ip++; - inbytes--; - break; - - case E2BIG: - return -1; - - default: - /* should never happen */ - fprintf (stderr, "Character conversion error: %s\n", strerror (errno)); - NOT_REACHED (); - break; - } + /* Do two rounds of iconv() calls: + + - The first round does the bulk of the conversion using the + caller-supplied input data. + + - The second round flushes any leftover output. This has a real effect + with input encodings that use combining diacritics, e.g. without the + second round the last character tends to get dropped when converting + from windows-1258 to other encodings. + */ + for (i = 0; i < 2; i++) + { + ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in; + size_t *inbytesp = i ? 
NULL : &inbytes; + + while (iconv (conv, inp, inbytesp, &out, &outbytes) == -1) + switch (errno) + { + case EINVAL: + if (outbytes < 2) + return -E2BIG; + if (!fallbackchar) + return -EINVAL; + *out++ = fallbackchar; + *out = '\0'; + return out - out_; + + case EILSEQ: + if (outbytes == 0) + return -E2BIG; + if (!fallbackchar) + return -EILSEQ; + *out++ = fallbackchar; + outbytes--; + if (inp) + { + in++; + inbytes--; + } + break; + + case E2BIG: + return -E2BIG; + + default: + /* should never happen */ + fprintf (stderr, "Character conversion error: %s\n", + strerror (errno)); + NOT_REACHED (); + break; + } + } if (outbytes == 0) - return -1; + return -E2BIG; - *op = '\0'; - return op - op_; + *out = '\0'; + return out - out_; } /* Converts the string TEXT, which should be encoded in FROM-encoding, to a @@ -492,21 +519,12 @@ filename_to_utf8 (const char *filename) return recode_string ("UTF-8", filename_encoding (), filename, -1); } -/* Converts the string TEXT, which should be encoded in FROM-encoding, to a - dynamically allocated string in TO-encoding. Any characters which cannot be - converted will be represented by '?'. - - The returned string will be null-terminated and allocated on POOL. - - This function's behaviour differs from that of g_convert_with_fallback - provided by GLib. The GLib function will fail (returns NULL) if any part of - the input string is not valid in the declared input encoding. This function - however perseveres even in the presence of badly encoded input. 
*/ -struct substring -recode_substring_pool (const char *to, const char *from, - struct substring text, struct pool *pool) +static int +recode_substring_pool__ (const char *to, const char *from, + struct substring text, char fallbackchar, + struct pool *pool, struct substring *out) { - size_t outbufferlength; + size_t bufsize; iconv_t conv ; if (to == NULL) @@ -519,33 +537,84 @@ recode_substring_pool (const char *to, const char *from, if ( (iconv_t) -1 == conv ) { - struct substring out; - ss_alloc_substring_pool (&out, text, pool); - return out; + if (fallbackchar) + { + out->string = pool_malloc (pool, text.length + 1); + out->length = text.length; + memcpy (out->string, text.string, text.length); + out->string[out->length] = '\0'; + return 0; + } + else + return EPROTO; } - for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 ) - if ( outbufferlength > text.length) - { - char *output = pool_malloc (pool, outbufferlength); - ssize_t output_len = try_recode (conv, text.string, text.length, - output, outbufferlength); - if (output_len >= 0) - return ss_buffer (output, output_len); - pool_free (pool, output); - } + for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2) + { + char *output = pool_malloc (pool, bufsize); + ssize_t retval; + + retval = try_recode (conv, fallbackchar, text.string, text.length, + output, bufsize); + if (retval >= 0) + { + *out = ss_buffer (output, retval); + return 0; + } + pool_free (pool, output); + + if (retval != -E2BIG) + return -retval; + } NOT_REACHED (); } +/* Converts the string TEXT, which should be encoded in FROM-encoding, to a + dynamically allocated string in TO-encoding. Any characters which cannot be + converted will be represented by '?'. + + The returned string will be null-terminated and allocated on POOL with + pool_malloc(). + + This function's behaviour differs from that of g_convert_with_fallback + provided by GLib. 
The GLib function will fail (returns NULL) if any part of + the input string is not valid in the declared input encoding. This function + however perseveres even in the presence of badly encoded input. */ +struct substring +recode_substring_pool (const char *to, const char *from, + struct substring text, struct pool *pool) +{ + struct substring out; + + recode_substring_pool__ (to, from, text, '?', pool, &out); + return out; +} + +/* Converts the string TEXT, which should be encoded in FROM-encoding, to a + dynamically allocated string in TO-encoding. On success, returns 0, and the + converted null-terminated string, allocated from POOL with pool_malloc(), is + stored in *OUT. On failure, returns a positive errno value. + + The function fails with an error if any part of the input string is not + valid in the declared input encoding. */ +int +recode_pedantically (const char *to, const char *from, + struct substring text, struct pool *pool, + struct substring *out) +{ + int error; + + error = recode_substring_pool__ (to, from, text, 0, pool, out); + if (error) + *out = ss_empty (); + return error; +} + void i18n_init (void) { - setlocale (LC_CTYPE, ""); - setlocale (LC_MESSAGES, ""); -#if HAVE_LC_PAPER - setlocale (LC_PAPER, ""); -#endif + setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, relocate(locale_dir)); textdomain (PACKAGE); @@ -593,7 +662,6 @@ set_encoding_from_locale (const char *loc) ok = false; } - setlocale (LC_CTYPE, tmp); free (tmp); @@ -655,9 +723,6 @@ get_system_decimal (void) { char radix_char; - char *ol = xstrdup (setlocale (LC_NUMERIC, NULL)); - setlocale (LC_NUMERIC, ""); - #if HAVE_NL_LANGINFO radix_char = nl_langinfo (RADIXCHAR)[0]; #else @@ -668,10 +733,6 @@ get_system_decimal (void) } #endif - /* We MUST leave LC_NUMERIC untouched, since it would - otherwise interfere with data_{in,out} */ - setlocale (LC_NUMERIC, ol); - free (ol); return radix_char; } @@ -685,6 +746,109 @@ uc_name (ucs4_t uc, char buffer[16]) return buffer; } +/* UTF-8 
functions that deal with uppercase/lowercase distinctions. */ + +/* Returns a hash value for the N bytes of UTF-8 encoded data starting at S, + with lowercase and uppercase letters treated as equal, starting from + BASIS. */ +unsigned int +utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis) +{ + uint8_t folded_buf[2048]; + size_t folded_len = sizeof folded_buf; + uint8_t *folded_s; + unsigned int hash; + + folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n, + NULL, UNINORM_NFKD, folded_buf, &folded_len); + if (folded_s != NULL) + { + hash = hash_bytes (folded_s, folded_len, basis); + if (folded_s != folded_buf) + free (folded_s); + } + else + { + if (errno == ENOMEM) + xalloc_die (); + hash = hash_bytes (s, n, basis); + } + + return hash; +} + +/* Returns a hash value for null-terminated UTF-8 string S, with lowercase and + uppercase letters treated as equal, starting from BASIS. */ +unsigned int +utf8_hash_case_string (const char *s, unsigned int basis) +{ + return utf8_hash_case_bytes (s, strlen (s), basis); +} + +/* Compares UTF-8 strings A and B case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strcasecmp (const char *a, const char *b) +{ + return utf8_strncasecmp (a, strlen (a), b, strlen (b)); +} + +/* Compares UTF-8 strings A (with length AN) and B (with length BN) + case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn) +{ + int result; + + if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an, + CHAR_CAST (const uint8_t *, b), bn, + NULL, UNINORM_NFKD, &result)) + { + if (errno == ENOMEM) + xalloc_die (); + + result = memcmp (a, b, MIN (an, bn)); + if (result == 0) + result = an < bn ? 
-1 : an > bn; + } + + return result; +} + +static char * +utf8_casemap (const char *s, + uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t, + uint8_t *, size_t *)) +{ + char *result; + size_t size; + + result = CHAR_CAST (char *, + f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1, + NULL, NULL, NULL, &size)); + if (result == NULL) + { + if (errno == ENOMEM) + xalloc_die (); + + result = xstrdup (s); + } + return result; +} + +char * +utf8_to_upper (const char *s) +{ + return utf8_casemap (s, u8_toupper); +} + +char * +utf8_to_lower (const char *s) +{ + return utf8_casemap (s, u8_tolower); +} + bool get_encoding_info (struct encoding_info *e, const char *name) { @@ -783,3 +947,117 @@ is_encoding_utf8 (const char *e) && ((e[3] == '8' && e[4] == '\0') || (e[3] == '-' && e[4] == '8' && e[5] == '\0'))); } + +static struct encoding_category *categories; +static int n_categories; + +static void SENTINEL (0) +add_category (size_t *allocated_categories, const char *category, ...) +{ + struct encoding_category *c; + const char *encodings[16]; + va_list args; + int i, n; + + /* Count encoding arguments. 
*/ + va_start (args, category); + n = 0; + while ((encodings[n] = va_arg (args, const char *)) != NULL) + { + const char *encoding = encodings[n]; + if (!strcmp (encoding, "Auto") || is_encoding_supported (encoding)) + n++; + } + assert (n < sizeof encodings / sizeof *encodings); + va_end (args); + + if (n == 0) + return; + + if (n_categories >= *allocated_categories) + categories = x2nrealloc (categories, + allocated_categories, sizeof *categories); + + c = &categories[n_categories++]; + c->category = category; + c->encodings = xmalloc (n * sizeof *c->encodings); + for (i = 0; i < n; i++) + c->encodings[i] = encodings[i]; + c->n_encodings = n; +} + +static void +init_encoding_categories (void) +{ + static bool inited; + size_t alloc; + + if (inited) + return; + inited = true; + + alloc = 0; + add_category (&alloc, "Unicode", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", NULL_SENTINEL); + add_category (&alloc, _("Arabic"), "IBM864", "ISO-8859-6", "Windows-1256", + NULL_SENTINEL); + add_category (&alloc, _("Armenian"), "ARMSCII-8", NULL_SENTINEL); + add_category (&alloc, _("Baltic"), "ISO-8859-13", "ISO-8859-4", + "Windows-1257", NULL_SENTINEL); + add_category (&alloc, _("Celtic"), "ISO-8859-14", NULL_SENTINEL); + add_category (&alloc, _("Central European"), "IBM852", "ISO-8859-2", + "Mac-CentralEurope", "Windows-1250", NULL_SENTINEL); + add_category (&alloc, _("Chinese Simplified"), "GB18030", "GB2312", "GBK", + "HZ-GB-2312", "ISO-2022-CN", NULL_SENTINEL); + add_category (&alloc, _("Chinese Traditional"), "Big5", "Big5-HKSCS", + "EUC-TW", NULL_SENTINEL); + add_category (&alloc, _("Croatian"), "MacCroatian", NULL_SENTINEL); + add_category (&alloc, _("Cyrillic"), "IBM855", "ISO-8859-5", "ISO-IR-111", + "KOI8-R", "MacCyrillic", NULL_SENTINEL); + add_category (&alloc, _("Cyrillic/Russian"), "IBM866", NULL_SENTINEL); + add_category (&alloc, _("Cyrillic/Ukrainian"), "KOI8-U", "MacUkrainian", + NULL_SENTINEL); + add_category (&alloc, 
_("Georgian"), "GEOSTD8", NULL_SENTINEL); + add_category (&alloc, _("Greek"), "ISO-8859-7", "MacGreek", NULL_SENTINEL); + add_category (&alloc, _("Gujarati"), "MacGujarati", NULL_SENTINEL); + add_category (&alloc, _("Gurmukhi"), "MacGurmukhi", NULL_SENTINEL); + add_category (&alloc, _("Hebrew"), "IBM862", "ISO-8859-8-I", "Windows-1255", + NULL_SENTINEL); + add_category (&alloc, _("Hebrew Visual"), "ISO-8859-8", NULL_SENTINEL); + add_category (&alloc, _("Hindi"), "MacDevangari", NULL_SENTINEL); + add_category (&alloc, _("Icelandic"), "MacIcelandic", NULL_SENTINEL); + add_category (&alloc, _("Japanese"), "EUC-JP", "ISO-2022-JP", "Shift_JIS", + NULL_SENTINEL); + add_category (&alloc, _("Korean"), "EUC-KR", "ISO-2022-KR", "JOHAB", "UHC", + NULL_SENTINEL); + add_category (&alloc, _("Nordic"), "ISO-8859-10", NULL_SENTINEL); + add_category (&alloc, _("Romanian"), "ISO-8859-16", "MacRomanian", + NULL_SENTINEL); + add_category (&alloc, _("South European"), "ISO-8859-3", NULL_SENTINEL); + add_category (&alloc, _("Thai"), "ISO-8859-11", "TIS-620", "Windows-874", + NULL_SENTINEL); + add_category (&alloc, _("Turkish"), "IBM857", "ISO-8859-9", "Windows-1254", + NULL_SENTINEL); + add_category (&alloc, _("Vietnamese"), "TVCN", "VISCII", "VPS", + "Windows-1258", NULL_SENTINEL); + add_category (&alloc, _("Western European"), "ISO-8859-1", "ISO-8859-15", + "Windows-1252", "IBM850", "MacRoman", NULL_SENTINEL); +} + +/* Returns an array of "struct encoding_category" that contains only the + categories and encodings that the system supports. */ +struct encoding_category * +get_encoding_categories (void) +{ + init_encoding_categories (); + return categories; +} + +/* Returns the number of elements in the array returned by + get_encoding_categories(). */ +size_t +get_n_encoding_categories (void) +{ + init_encoding_categories (); + return n_categories; +}