X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fi18n.c;h=bc0db0b896426c094b305382141a6a3e69959cdb;hb=f790dbda9d498eef9c9c0a49078adbeecf768d56;hp=0819299d37abaae54e4f666f046e798818c53a7a;hpb=5b2fcf1abbfbf48a00300a41fa63444402a4bbb7;p=pspp diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 0819299d37..bc0db0b896 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include "libpspp/assertion.h" @@ -37,8 +37,10 @@ #include "libpspp/str.h" #include "libpspp/version.h" +#include "gl/c-ctype.h" #include "gl/c-strcase.h" #include "gl/localcharset.h" +#include "gl/minmax.h" #include "gl/xalloc.h" #include "gl/relocatable.h" #include "gl/xstrndup.h" @@ -47,19 +49,19 @@ #define _(msgid) gettext (msgid) struct converter - { - char *tocode; - char *fromcode; - iconv_t conv; - int error; - }; +{ + char *tocode; + char *fromcode; + iconv_t conv; + int null_char_width; +}; static char *default_encoding; static struct hmapx map; /* A wrapper around iconv_open */ static struct converter * -create_iconv__ (const char* tocode, const char* fromcode) +create_iconv (const char* tocode, const char* fromcode) { size_t hash; struct hmapx_node *node; @@ -68,41 +70,63 @@ create_iconv__ (const char* tocode, const char* fromcode) hash = hash_string (tocode, hash_string (fromcode, 0)); HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map) - if (!strcmp (tocode, converter->tocode) - && !strcmp (fromcode, converter->fromcode)) - return converter; + { + if 
(!converter) + return NULL; + + if (!strcmp (tocode, converter->tocode) + && !strcmp (fromcode, converter->fromcode)) + return converter; + } converter = xmalloc (sizeof *converter); converter->tocode = xstrdup (tocode); converter->fromcode = xstrdup (fromcode); converter->conv = iconv_open (tocode, fromcode); - converter->error = converter->conv == (iconv_t) -1 ? errno : 0; - hmapx_insert (&map, converter, hash); - - return converter; -} - -static iconv_t -create_iconv (const char* tocode, const char* fromcode) -{ - struct converter *converter; - - converter = create_iconv__ (tocode, fromcode); - + int error = converter->conv == (iconv_t) ~0 ? errno : 0; /* I don't think it's safe to translate this string or to use messaging as the converters have not yet been set up */ - if (converter->error && strcmp (tocode, fromcode)) + if (error && strcmp (tocode, fromcode)) { fprintf (stderr, "Warning: " "cannot create a converter for `%s' to `%s': %s\n", - fromcode, tocode, strerror (converter->error)); - converter->error = 0; + fromcode, tocode, strerror (error)); + + free (converter->tocode); + free (converter->fromcode); + free (converter); + + hmapx_insert (&map, NULL, hash); + return NULL; } - return converter->conv; + /* Find out how many bytes there are in a null char in the target + encoding */ + iconv_t bconv = iconv_open (tocode, "ASCII"); + if (bconv != (iconv_t) -1) + { + ICONV_CONST char *nullstr = strdup (""); + ICONV_CONST char *outbuf = strdup ("XXXXXXXX"); + ICONV_CONST char *snullstr = nullstr; + ICONV_CONST char *soutbuf = outbuf; + + size_t inbytes = 1; + const size_t bytes = 8; + size_t outbytes = bytes; + if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes)) + converter->null_char_width = bytes - outbytes; + free (snullstr); + free (soutbuf); + iconv_close (bconv); + } + + hmapx_insert (&map, converter, hash); + + return converter; } + /* Converts the single byte C from encoding FROM to TO, returning the first byte of the result. 
@@ -146,53 +170,80 @@ recode_string_len (const char *to, const char *from, Returns the output length if successful, -1 if the output buffer is too small. */ static ssize_t -try_recode (iconv_t conv, - const char *ip, size_t inbytes, - char *op_, size_t outbytes) +try_recode (struct converter *cvtr, char fallbackchar, + const char *in, size_t inbytes, + char *out_, size_t outbytes) { - /* FIXME: Need to ensure that this char is valid in the target encoding */ - const char fallbackchar = '?'; - char *op = op_; + char *out = out_; + int i, j; + + int null_bytes = cvtr->null_char_width; /* Put the converter into the initial shift state, in case there was any state information left over from its last usage. */ - iconv (conv, NULL, 0, NULL, 0); + iconv (cvtr->conv, NULL, 0, NULL, 0); - while (iconv (conv, (ICONV_CONST char **) &ip, &inbytes, - &op, &outbytes) == -1) - switch (errno) - { - case EINVAL: - if (outbytes < 2) - return -1; - *op++ = fallbackchar; - *op = '\0'; - return op - op_; - - case EILSEQ: - if (outbytes == 0) - return -1; - *op++ = fallbackchar; - outbytes--; - ip++; - inbytes--; - break; - - case E2BIG: - return -1; - - default: - /* should never happen */ - fprintf (stderr, "Character conversion error: %s\n", strerror (errno)); - NOT_REACHED (); - break; - } + /* Do two rounds of iconv() calls: + + - The first round does the bulk of the conversion using the + caller-supplied input data.. + + - The second round flushes any leftover output. This has a real effect + with input encodings that use combining diacritics, e.g. without the + second round the last character tends to gets dropped when converting + from windows-1258 to other encodings. + */ + for (i = 0; i < 2; i++) + { + ICONV_CONST char **inp = i ? NULL : (ICONV_CONST char **) &in; + size_t *inbytesp = i ? 
NULL : &inbytes; + + while (iconv (cvtr->conv, inp, inbytesp, &out, &outbytes) == -1) + switch (errno) + { + case EINVAL: + if (outbytes < null_bytes + 1) + return -E2BIG; + if (!fallbackchar) + return -EINVAL; + *out++ = fallbackchar; + for (j = 0 ; j < null_bytes ; ++j) + *out++ = '\0'; + return out - 1 - out_; + + case EILSEQ: + if (outbytes == 0) + return -E2BIG; + if (!fallbackchar) + return -EILSEQ; + *out++ = fallbackchar; + outbytes--; + if (inp) + { + in++; + inbytes--; + } + break; + + case E2BIG: + return -E2BIG; + + default: + /* should never happen */ + fprintf (stderr, "Character conversion error: %s\n", + strerror (errno)); + NOT_REACHED (); + break; + } + } + + if (outbytes <= null_bytes - 1) + return -E2BIG; - if (outbytes == 0) - return -1; + for (i = 0 ; i < null_bytes ; ++i) + *out++ = '\0'; - *op = '\0'; - return op - op_; + return out - 1 - out_; } /* Converts the string TEXT, which should be encoded in FROM-encoding, to a @@ -217,7 +268,7 @@ recode_string_pool (const char *to, const char *from, return NULL; if ( length == -1 ) - length = strlen (text); + length = strlen (text); out = recode_substring_pool (to, from, ss_buffer (text, length), pool); return out.string; @@ -256,10 +307,10 @@ xconcat2 (const char *a, size_t a_len, fits or until HEAD_LEN reaches 0. [*] Actually this function drops grapheme clusters instead of characters, so - that, e.g. a Unicode character followed by a combining accent character - is either completely included or completely excluded from HEAD_LEN. See - UAX #29 at http://unicode.org/reports/tr29/ for more information on - grapheme clusters. + that, e.g. a Unicode character followed by a combining accent character + is either completely included or completely excluded from HEAD_LEN. See + UAX #29 at http://unicode.org/reports/tr29/ for more information on + grapheme clusters. A null ENCODING is treated as UTF-8. 
@@ -270,16 +321,16 @@ xconcat2 (const char *a, size_t a_len, Simple examples for encoding="UTF-8", max_len=6: - head="abc", tail="xyz" => 3 - head="abcd", tail="xyz" => 3 ("d" dropped). - head="abc", tail="uvwxyz" => 0 ("abc" dropped). - head="abc", tail="tuvwxyz" => 0 ("abc" dropped). + head="abc", tail="xyz" => 3 + head="abcd", tail="xyz" => 3 ("d" dropped). + head="abc", tail="uvwxyz" => 0 ("abc" dropped). + head="abc", tail="tuvwxyz" => 0 ("abc" dropped). Examples for encoding="ISO-8859-1", max_len=6: - head="éèä", tail="xyz" => 6 - (each letter in head is only 1 byte in ISO-8859-1 even though they - each take 2 bytes in UTF-8 encoding) + head="éèä", tail="xyz" => 6 + (each letter in head is only 1 byte in ISO-8859-1 even though they + each take 2 bytes in UTF-8 encoding) */ static size_t utf8_encoding_concat__ (const char *head, size_t head_len, @@ -398,25 +449,25 @@ utf8_encoding_concat__ (const char *head, size_t head_len, included, even if TAIL by itself is longer than MAX_LEN in ENCODING. [*] Actually this function drops grapheme clusters instead of characters, so - that, e.g. a Unicode character followed by a combining accent character - is either completely included or completely excluded from the returned - string. See UAX #29 at http://unicode.org/reports/tr29/ for more - information on grapheme clusters. + that, e.g. a Unicode character followed by a combining accent character + is either completely included or completely excluded from the returned + string. See UAX #29 at http://unicode.org/reports/tr29/ for more + information on grapheme clusters. A null ENCODING is treated as UTF-8. 
Simple examples for encoding="UTF-8", max_len=6: - head="abc", tail="xyz" => "abcxyz" - head="abcd", tail="xyz" => "abcxyz" - head="abc", tail="uvwxyz" => "uvwxyz" - head="abc", tail="tuvwxyz" => "tuvwxyz" + head="abc", tail="xyz" => "abcxyz" + head="abcd", tail="xyz" => "abcxyz" + head="abc", tail="uvwxyz" => "uvwxyz" + head="abc", tail="tuvwxyz" => "tuvwxyz" Examples for encoding="ISO-8859-1", max_len=6: - head="éèä", tail="xyz" => "éèäxyz" - (each letter in HEAD is only 1 byte in ISO-8859-1 even though they - each take 2 bytes in UTF-8 encoding) + head="éèä", tail="xyz" => "éèäxyz" + (each letter in HEAD is only 1 byte in ISO-8859-1 even though they + each take 2 bytes in UTF-8 encoding) */ char * utf8_encoding_concat (const char *head, const char *tail, @@ -456,10 +507,10 @@ utf8_encoding_concat_len (const char *head, const char *tail, ENCODING. Both S and the returned string are encoded in UTF-8. [*] Actually this function drops grapheme clusters instead of characters, so - that, e.g. a Unicode character followed by a combining accent character - is either completely included or completely excluded from the returned - string. See UAX #29 at http://unicode.org/reports/tr29/ for more - information on grapheme clusters. + that, e.g. a Unicode character followed by a combining accent character + is either completely included or completely excluded from the returned + string. See UAX #29 at http://unicode.org/reports/tr29/ for more + information on grapheme clusters. A null ENCODING is treated as UTF-8. */ @@ -496,22 +547,13 @@ filename_to_utf8 (const char *filename) return recode_string ("UTF-8", filename_encoding (), filename, -1); } -/* Converts the string TEXT, which should be encoded in FROM-encoding, to a - dynamically allocated string in TO-encoding. Any characters which cannot be - converted will be represented by '?'. - - The returned string will be null-terminated and allocated on POOL. 
- - This function's behaviour differs from that of g_convert_with_fallback - provided by GLib. The GLib function will fail (returns NULL) if any part of - the input string is not valid in the declared input encoding. This function - however perseveres even in the presence of badly encoded input. */ -struct substring -recode_substring_pool (const char *to, const char *from, - struct substring text, struct pool *pool) +static int +recode_substring_pool__ (const char *to, const char *from, + struct substring text, char fallbackchar, + struct pool *pool, struct substring *out) { - size_t outbufferlength; - iconv_t conv ; + size_t bufsize; + struct converter *conv; if (to == NULL) to = default_encoding; @@ -521,36 +563,86 @@ recode_substring_pool (const char *to, const char *from, conv = create_iconv (to, from); - if ( (iconv_t) -1 == conv ) + if ( NULL == conv ) { - struct substring out; - ss_alloc_substring_pool (&out, text, pool); - return out; + if (fallbackchar) + { + out->string = pool_malloc (pool, text.length + 1); + out->length = text.length; + memcpy (out->string, text.string, text.length); + out->string[out->length] = '\0'; + return 0; + } + else + return EPROTO; } - for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 ) - if ( outbufferlength > text.length) - { - char *output = pool_malloc (pool, outbufferlength); - ssize_t output_len = try_recode (conv, text.string, text.length, - output, outbufferlength); - if (output_len >= 0) - return ss_buffer (output, output_len); - pool_free (pool, output); - } + for (bufsize = text.length + 1; bufsize > text.length; bufsize *= 2) + { + char *output = pool_malloc (pool, bufsize); + ssize_t retval; + + retval = try_recode (conv, fallbackchar, text.string, text.length, + output, bufsize); + if (retval >= 0) + { + *out = ss_buffer (output, retval); + return 0; + } + pool_free (pool, output); + + if (retval != -E2BIG) + return -retval; + } NOT_REACHED (); } +/* Converts the string TEXT, which should be 
encoded in FROM-encoding, to a + dynamically allocated string in TO-encoding. Any characters which cannot be + converted will be represented by '?'. + + The returned string will be null-terminated and allocated on POOL with + pool_malloc(). + + This function's behaviour differs from that of g_convert_with_fallback + provided by GLib. The GLib function will fail (returns NULL) if any part of + the input string is not valid in the declared input encoding. This function + however perseveres even in the presence of badly encoded input. */ +struct substring +recode_substring_pool (const char *to, const char *from, + struct substring text, struct pool *pool) +{ + struct substring out; + + recode_substring_pool__ (to, from, text, '?', pool, &out); + return out; +} + +/* Converts the string TEXT, which should be encoded in FROM-encoding, to a + dynamically allocated string in TO-encoding. On success, returns 0, and the + converted null-terminated string, allocated from POOL with pool_malloc(), is + stored in *OUT. On failure, returns a positive errno value. + + The function fails with an error if any part of the input string is not + valid in the declared input encoding. */ +int +recode_pedantically (const char *to, const char *from, + struct substring text, struct pool *pool, + struct substring *out) +{ + int error; + + error = recode_substring_pool__ (to, from, text, 0, pool, out); + if (error) + *out = ss_empty (); + return error; +} + void i18n_init (void) { - setlocale (LC_CTYPE, ""); - setlocale (LC_COLLATE, ""); - setlocale (LC_MESSAGES, ""); -#if HAVE_LC_PAPER - setlocale (LC_PAPER, ""); -#endif + setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, relocate(locale_dir)); textdomain (PACKAGE); @@ -575,7 +667,7 @@ set_default_encoding (const char *enc) /* Attempts to set the encoding from a locale name - returns true if successfull. + returns true if successful. This function does not (should not!) alter the current locale. 
*/ bool @@ -598,7 +690,6 @@ set_encoding_from_locale (const char *loc) ok = false; } - setlocale (LC_CTYPE, tmp); free (tmp); @@ -624,6 +715,8 @@ i18n_done (void) HMAPX_FOR_EACH (cvtr, node, &map) { + if (cvtr == NULL) + continue; free (cvtr->tocode); free (cvtr->fromcode); if (cvtr->conv != (iconv_t) -1) @@ -654,15 +747,12 @@ valid_encoding (const char *enc) /* Return the system local's idea of the - decimal seperator character */ + decimal separator character */ char get_system_decimal (void) { char radix_char; - char *ol = xstrdup (setlocale (LC_NUMERIC, NULL)); - setlocale (LC_NUMERIC, ""); - #if HAVE_NL_LANGINFO radix_char = nl_langinfo (RADIXCHAR)[0]; #else @@ -673,10 +763,6 @@ get_system_decimal (void) } #endif - /* We MUST leave LC_NUMERIC untouched, since it would - otherwise interfere with data_{in,out} */ - setlocale (LC_NUMERIC, ol); - free (ol); return radix_char; } @@ -690,14 +776,191 @@ uc_name (ucs4_t uc, char buffer[16]) return buffer; } +/* UTF-8 functions that deal with uppercase/lowercase distinctions. */ + +/* Returns a hash value for the N bytes of UTF-8 encoded data starting at S, + with lowercase and uppercase letters treated as equal, starting from + BASIS. */ +unsigned int +utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis) +{ + uint8_t folded_buf[2048]; + size_t folded_len = sizeof folded_buf; + uint8_t *folded_s; + unsigned int hash; + + folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n, + NULL, UNINORM_NFKD, folded_buf, &folded_len); + if (folded_s != NULL) + { + hash = hash_bytes (folded_s, folded_len, basis); + if (folded_s != folded_buf) + free (folded_s); + } + else + { + if (errno == ENOMEM) + xalloc_die (); + hash = hash_bytes (s, n, basis); + } + + return hash; +} + +/* Returns a hash value for null-terminated UTF-8 string S, with lowercase and + uppercase letters treated as equal, starting from BASIS. 
*/ +unsigned int +utf8_hash_case_string (const char *s, unsigned int basis) +{ + return utf8_hash_case_bytes (s, strlen (s), basis); +} + +/* Compares UTF-8 strings A and B case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strcasecmp (const char *a, const char *b) +{ + return utf8_strncasecmp (a, strlen (a), b, strlen (b)); +} + +/* Compares UTF-8 strings A (with length AN) and B (with length BN) + case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn) +{ + int result; + + if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an, + CHAR_CAST (const uint8_t *, b), bn, + NULL, UNINORM_NFKD, &result)) + { + if (errno == ENOMEM) + xalloc_die (); + + result = memcmp (a, b, MIN (an, bn)); + if (result == 0) + result = an < bn ? -1 : an > bn; + } + + return result; +} + +static bool +is_all_digits (const uint8_t *s, size_t len) +{ + for (size_t i = 0; i < len; i++) + if (!c_isdigit (s[i])) + return false; + return true; +} + +/* Compares UTF-8 strings A and B case-insensitively. If the strings end in a + number, then they are compared numerically. Returns a negative value if A < + B, zero if A == B, positive if A > B. */ +int +utf8_strverscasecmp (const char *a, const char *b) +{ + /* Normalize A. */ + uint8_t a_stub[64]; + size_t a_len = sizeof a_stub; + uint8_t *a_norm = u8_casefold (CHAR_CAST (uint8_t *, a), strlen (a), NULL, + UNINORM_NFKD, a_stub, &a_len); + + /* Normalize B. */ + uint8_t b_stub[64]; + size_t b_len = sizeof b_stub; + uint8_t *b_norm = u8_casefold (CHAR_CAST (uint8_t *, b), strlen (b), NULL, + UNINORM_NFKD, b_stub, &b_len); + + int result; + if (!a_norm || !b_norm) + { + result = strcmp (a, b); + goto exit; + } + + size_t len = MIN (a_len, b_len); + for (size_t i = 0; i < len; i++) + if (a_norm[i] != b_norm[i]) + { + /* If both strings end in digits, compare them numerically. 
*/ + if (is_all_digits (&a_norm[i], a_len - i) + && is_all_digits (&b_norm[i], b_len - i)) + { + /* Start by stripping leading zeros, since those don't matter for + numerical comparison. */ + size_t ap, bp; + for (ap = i; ap < a_len; ap++) + if (a_norm[ap] != '0') + break; + for (bp = i; bp < b_len; bp++) + if (b_norm[bp] != '0') + break; + + /* The number with more digits, if there is one, is larger. */ + size_t a_digits = a_len - ap; + size_t b_digits = b_len - bp; + if (a_digits != b_digits) + result = a_digits > b_digits ? 1 : -1; + else + result = memcmp (&a_norm[ap], &b_norm[bp], a_digits); + } + else + result = a_norm[i] > b_norm[i] ? 1 : -1; + goto exit; + } + result = a_len < b_len ? -1 : a_len > b_len; + +exit: + if (a_norm != a_stub) + free (a_norm); + if (b_norm != b_stub) + free (b_norm); + return result; +} + +static char * +utf8_casemap (const char *s, + uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t, + uint8_t *, size_t *)) +{ + char *result; + size_t size; + + result = CHAR_CAST (char *, + f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1, + NULL, NULL, NULL, &size)); + if (result == NULL) + { + if (errno == ENOMEM) + xalloc_die (); + + result = xstrdup (s); + } + return result; +} + +char * +utf8_to_upper (const char *s) +{ + return utf8_casemap (s, u8_toupper); +} + +char * +utf8_to_lower (const char *s) +{ + return utf8_casemap (s, u8_tolower); +} + bool get_encoding_info (struct encoding_info *e, const char *name) { const struct substring in = SS_LITERAL_INITIALIZER ( - "\t\n\v\f\r " - "!\"#$%&'()*+,-./0123456789:;<=>?@" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" - "abcdefghijklmnopqrstuvwxyz{|}~"); + "\t\n\v\f\r " + "!\"#$%&'()*+,-./0123456789:;<=>?@" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" + "abcdefghijklmnopqrstuvwxyz{|}~"); struct substring out, cr, lf, space; bool ok; @@ -771,8 +1034,8 @@ is_encoding_ebcdic_compatible (const char *encoding) bool is_encoding_supported (const char *encoding) { - return (create_iconv__ ("UTF-8", 
encoding)->conv != (iconv_t) -1 - && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1); + return (create_iconv ("UTF-8", encoding) + && create_iconv (encoding, "UTF-8")); } /* Returns true if E is the name of a UTF-8 encoding. @@ -793,7 +1056,7 @@ static struct encoding_category *categories; static int n_categories; static void SENTINEL (0) -add_category (size_t *allocated_categories, const char *category, ...) + add_category (size_t *allocated_categories, const char *category, ...) { struct encoding_category *c; const char *encodings[16];