From: Ben Pfaff Date: Fri, 28 Dec 2012 03:51:08 +0000 (-0800) Subject: i18n: New functions for UTF-8 case conversion. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8539c057c6889ff335da0e00117cc0f5fb9bf72d;p=pspp i18n: New functions for UTF-8 case conversion. Also, use the new functions in a few cases where we want a full UTF-8 conversion. --- diff --git a/Smake b/Smake index 22699f65e3..72ec791a5e 100644 --- a/Smake +++ b/Smake @@ -78,6 +78,8 @@ GNULIB_MODULES = \ trunc \ unicase/u8-casecmp \ unicase/u8-casefold \ + unicase/u8-tolower \ + unicase/u8-toupper \ unictype/ctype-print \ unictype/property-id-continue \ unictype/property-id-start \ diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index defe460f5c..35f40d3dbc 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1573,7 +1573,8 @@ parse_long_var_name_map (struct sfm_reader *r, if (record == NULL) { - /* Convert variable names to lowercase. */ + /* There are no long variable names. Use the short variable names, + converted to lowercase, as the long variable names. 
*/ size_t i; for (i = 0; i < dict_get_var_cnt (dict); i++) @@ -1581,11 +1582,8 @@ parse_long_var_name_map (struct sfm_reader *r, struct variable *var = dict_get_var (dict, i); char *new_name; - new_name = xstrdup (var_get_name (var)); - str_lowercase (new_name); - + new_name = utf8_to_lower (var_get_name (var)); rename_var_and_save_short_names (dict, var, new_name); - free (new_name); } diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index 5003ca2b89..d02369856f 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -774,11 +774,12 @@ write_mrsets (struct sfm_writer *w, const struct dictionary *dict, for (j = 0; j < mrset->n_vars; j++) { const char *short_name_utf8 = var_get_short_name (mrset->vars[j], 0); + char *lower_name_utf8 = utf8_to_lower (short_name_utf8); char *short_name = recode_string (encoding, "UTF-8", - short_name_utf8, -1); - str_lowercase (short_name); + lower_name_utf8, -1); ds_put_format (&s, " %s", short_name); free (short_name); + free (lower_name_utf8); } ds_put_byte (&s, '\n'); } diff --git a/src/data/variable.c b/src/data/variable.c index b63e344905..fe4645ee41 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -926,8 +926,7 @@ var_set_short_name (struct variable *var, size_t idx, const char *short_name) for (i = old_cnt; i < var->short_name_cnt; i++) var->short_names[i] = NULL; } - var->short_names[idx] = xstrdup (short_name); - str_uppercase (var->short_names[idx]); + var->short_names[idx] = utf8_to_upper (short_name); } dict_var_changed (var); diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 1779afc434..dca85db4f0 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -748,6 +748,39 @@ utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn) return result; } + +static char * +utf8_casemap (const char *s, + uint8_t *(*f) (const uint8_t *, size_t, const char *, uninorm_t, + uint8_t *, size_t *)) +{ + char *result; + size_t size; + + result = CHAR_CAST (char *, 
+ f (CHAR_CAST (const uint8_t *, s), strlen (s) + 1, + NULL, NULL, NULL, &size)); + if (result == NULL) + { + if (errno == ENOMEM) + xalloc_die (); + + result = xstrdup (s); + } + return result; +} + +char * +utf8_to_upper (const char *s) +{ + return utf8_casemap (s, u8_toupper); +} + +char * +utf8_to_lower (const char *s) +{ + return utf8_casemap (s, u8_tolower); +} bool get_encoding_info (struct encoding_info *e, const char *name) diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 380c3cbb79..6722b5cec9 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -72,6 +72,8 @@ unsigned int utf8_hash_case_bytes (const char *, size_t n, unsigned int basis); unsigned int utf8_hash_case_string (const char *, unsigned int basis); int utf8_strcasecmp (const char *, const char *); int utf8_strncasecmp (const char *, size_t, const char *, size_t); +char *utf8_to_upper (const char *); +char *utf8_to_lower (const char *); /* Information about character encodings. */ diff --git a/src/libpspp/str.c b/src/libpspp/str.c index a58c52c36e..cde0db9ae0 100644 --- a/src/libpspp/str.c +++ b/src/libpspp/str.c @@ -28,6 +28,7 @@ #include "libpspp/message.h" #include "libpspp/pool.h" +#include "gl/c-ctype.h" #include "gl/c-vasnprintf.h" #include "gl/relocatable.h" #include "gl/minmax.h" @@ -233,20 +234,26 @@ str_copy_buf_trunc (char *dst, size_t dst_size, dst[dst_len] = '\0'; } -/* Converts each byte in S to uppercase. */ +/* Converts each byte in S to uppercase. + + This is suitable only for ASCII strings. Use utf8_to_upper() for UTF-8 + strings. */ void str_uppercase (char *s) { for (; *s != '\0'; s++) - *s = toupper ((unsigned char) *s); + *s = c_toupper ((unsigned char) *s); } -/* Converts each byte in S to lowercase. */ +/* Converts each byte in S to lowercase. + + This is suitable only for ASCII strings. 
Use utf8_to_lower() for UTF-8 + strings. */ void str_lowercase (char *s) { for (; *s != '\0'; s++) - *s = tolower ((unsigned char) *s); + *s = c_tolower ((unsigned char) *s); } /* Converts NUMBER into a string in 26-adic notation in BUFFER, diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index 9fd8c09e19..c5699b136a 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -131,7 +131,7 @@ num8,Format: F8.0,,8 ,Missing Values: 1 THRU 3; 5,, num9,Format: F8.0,,9 ,Missing Values: 1 THRU HIGHEST; -5,, -numÀÈÌÑÒ,Format: F8.0,,10 +numàèìñò,Format: F8.0,,10 ,Missing Values: LOWEST THRU 1; 5,, str1,Format: A4,,11 str2,String variable 2's label,,12 @@ -151,7 +151,7 @@ str8,25-byte string,,18 ,Format: A25,, Table: Data List -num1,num2,num3,num4,num5,num6,num7,num8,num9,numÀÈÌÑÒ,str1,str2,str3,str4,str5,str6,str7,str8 +num1,num2,num3,num4,num5,num6,num7,num8,num9,numàèìñò,str1,str2,str3,str4,str5,str6,str7,str8 1,2,3,4,5,6,7,8,9,10,abcd,efgh,ijkl,mnop,qrst,uvwx,yzABCDEFGHI,JKLMNOPQRSTUVWXYZ01234567 ]) done @@ -1014,11 +1014,11 @@ LIST. 
AT_CHECK([pspp -o pspp.csv sys-file.sps]) AT_CHECK([grep -v Measure pspp.csv | grep -v Display], [0], [dnl Variable,Description,,Position -sÉq256,Format: A256,,1 +séq256,Format: A256,,1 str600,Format: A600,,2 Table: Data List -sÉq256,str600 +séq256,str600 abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@a,abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyz ]) done