From 23fd25fa0a2fb9d613f4f9445000e49cc3b83db1 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 27 Dec 2012 19:32:40 -0800 Subject: [PATCH] Use UTF-8 case-insensitive hashes and comparisons for language identifiers. The PSPP language has case-insensitive identifiers (variable names, etc.) but until now it has only implemented case insensitivity for ASCII characters. This commit properly implements case insensitivity for all Unicode characters, using libunistring. Bug #31072. --- Smake | 3 + src/data/attributes.c | 7 ++- src/data/dictionary.c | 14 ++--- src/data/file-handle-def.c | 6 +- src/data/session.c | 9 ++- src/data/sys-file-reader.c | 2 +- src/data/variable.c | 8 +-- src/data/vector.c | 5 +- src/language/control/repeat.c | 5 +- src/language/dictionary/modify-variables.c | 7 ++- src/language/dictionary/mrsets.c | 4 +- src/language/dictionary/vector.c | 5 +- src/language/lexer/variable-parser.c | 9 +-- src/language/stats/descriptives.c | 4 +- src/libpspp/hash-functions.c | 49 +-------------- src/libpspp/hash-functions.h | 4 +- src/libpspp/i18n.c | 72 ++++++++++++++++++++++ src/libpspp/i18n.h | 5 ++ src/libpspp/stringi-map.c | 17 ++--- src/libpspp/stringi-set.c | 17 ++--- tests/automake.mk | 1 + tests/data/dictionary.at | 14 +++++ tests/libpspp/stringi-map-test.c | 5 +- tests/libpspp/stringi-set-test.c | 9 +-- 24 files changed, 170 insertions(+), 111 deletions(-) create mode 100644 tests/data/dictionary.at diff --git a/Smake b/Smake index f82f4f4442..22699f65e3 100644 --- a/Smake +++ b/Smake @@ -76,11 +76,14 @@ GNULIB_MODULES = \ sys_stat \ tempname \ trunc \ + unicase/u8-casecmp \ + unicase/u8-casefold \ unictype/ctype-print \ unictype/property-id-continue \ unictype/property-id-start \ unigbrk/uc-is-grapheme-break \ unilbrk/u8-possible-linebreaks \ + uninorm/nfkd \ unistd \ unistr/u8-check \ unistr/u8-cpy \ diff --git a/src/data/attributes.c b/src/data/attributes.c index d262e2ebc3..c4ae97c285 100644 --- a/src/data/attributes.c +++ b/src/data/attributes.c @@ 
-23,6 +23,7 @@ #include "libpspp/array.h" #include "libpspp/hash-functions.h" +#include "libpspp/i18n.h" #include "gl/xalloc.h" @@ -223,8 +224,8 @@ attrset_lookup (struct attrset *set, const char *name) { struct attribute *attr; HMAP_FOR_EACH_WITH_HASH (attr, struct attribute, node, - hash_case_string (name, 0), &set->map) - if (!strcasecmp (attribute_get_name (attr), name)) + utf8_hash_case_string (name, 0), &set->map) + if (!utf8_strcasecmp (attribute_get_name (attr), name)) break; return attr; } @@ -237,7 +238,7 @@ attrset_add (struct attrset *set, struct attribute *attr) { const char *name = attribute_get_name (attr); assert (attrset_lookup (set, name) == NULL); - hmap_insert (&set->map, &attr->node, hash_case_string (name, 0)); + hmap_insert (&set->map, &attr->node, utf8_hash_case_string (name, 0)); } /* Deletes any attribute from SET that matches NAME diff --git a/src/data/dictionary.c b/src/data/dictionary.c index 5731d78695..d36f8c519f 100644 --- a/src/data/dictionary.c +++ b/src/data/dictionary.c @@ -397,7 +397,7 @@ add_var (struct dictionary *d, struct variable *v) vardict->dict = d; vardict->var = v; hmap_insert (&d->name_map, &vardict->name_node, - hash_case_string (var_get_name (v), 0)); + utf8_hash_case_string (var_get_name (v), 0)); vardict->case_index = d->next_value_idx; var_set_vardict (v, vardict); @@ -487,10 +487,10 @@ dict_lookup_var (const struct dictionary *d, const char *name) struct vardict_info *vardict; HMAP_FOR_EACH_WITH_HASH (vardict, struct vardict_info, name_node, - hash_case_string (name, 0), &d->name_map) + utf8_hash_case_string (name, 0), &d->name_map) { struct variable *var = vardict->var; - if (!strcasecmp (var_get_name (var), name)) + if (!utf8_strcasecmp (var_get_name (var), name)) return var; } @@ -742,7 +742,7 @@ rename_var (struct variable *v, const char *new_name) struct vardict_info *vardict = var_get_vardict (v); var_clear_vardict (v); var_set_name (v, new_name); - vardict->name_node.hash = hash_case_string (new_name, 0); 
+ vardict->name_node.hash = utf8_hash_case_string (new_name, 0); var_set_vardict (v, vardict); } @@ -753,7 +753,7 @@ void dict_rename_var (struct dictionary *d, struct variable *v, const char *new_name) { - assert (!strcasecmp (var_get_name (v), new_name) + assert (!utf8_strcasecmp (var_get_name (v), new_name) || dict_lookup_var (d, new_name) == NULL); unindex_var (d, var_get_vardict (v)); @@ -1411,7 +1411,7 @@ dict_lookup_vector (const struct dictionary *d, const char *name) { size_t i; for (i = 0; i < d->vector_cnt; i++) - if (!strcasecmp (vector_get_name (d->vector[i]), name)) + if (!utf8_strcasecmp (vector_get_name (d->vector[i]), name)) return d->vector[i]; return NULL; } @@ -1456,7 +1456,7 @@ dict_lookup_mrset_idx (const struct dictionary *dict, const char *name) size_t i; for (i = 0; i < dict->n_mrsets; i++) - if (!strcasecmp (name, dict->mrsets[i]->name)) + if (!utf8_strcasecmp (name, dict->mrsets[i]->name)) return i; return SIZE_MAX; diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c index e4447a080b..121a4909c4 100644 --- a/src/data/file-handle-def.c +++ b/src/data/file-handle-def.c @@ -176,8 +176,8 @@ fh_from_id (const char *id) struct file_handle *handle; HMAP_FOR_EACH_WITH_HASH (handle, struct file_handle, name_node, - hash_case_string (id, 0), &named_handles) - if (!strcasecmp (id, handle->id)) + utf8_hash_case_string (id, 0), &named_handles) + if (!utf8_strcasecmp (id, handle->id)) { return fh_ref (handle); } @@ -206,7 +206,7 @@ create_handle (const char *id, char *handle_name, enum fh_referent referent, if (id != NULL) { hmap_insert (&named_handles, &handle->name_node, - hash_case_string (handle->id, 0)); + utf8_hash_case_string (handle->id, 0)); } return handle; diff --git a/src/data/session.c b/src/data/session.c index a308ec1f62..c6f5031a53 100644 --- a/src/data/session.c +++ b/src/data/session.c @@ -25,6 +25,7 @@ #include "libpspp/assertion.h" #include "libpspp/cast.h" #include "libpspp/hash-functions.h" +#include 
"libpspp/i18n.h" #include "libpspp/str.h" #include "libpspp/hmapx.h" @@ -94,7 +95,8 @@ session_add_dataset (struct session *s, struct dataset *ds) if (old != NULL) session_remove_dataset (s, old); - hmapx_insert (&s->datasets, ds, hash_case_string (dataset_name (ds), 0)); + hmapx_insert (&s->datasets, ds, + utf8_hash_case_string (dataset_name (ds), 0)); if (s->active == NULL) s->active = ds; @@ -193,8 +195,9 @@ session_lookup_dataset__ (const struct session *s_, const char *name) struct hmapx_node *node; struct dataset *ds; - HMAPX_FOR_EACH_WITH_HASH (ds, node, hash_case_string (name, 0), &s->datasets) - if (!strcasecmp (dataset_name (ds), name)) + HMAPX_FOR_EACH_WITH_HASH (ds, node, utf8_hash_case_string (name, 0), + &s->datasets) + if (!utf8_strcasecmp (dataset_name (ds), name)) return node; return NULL; diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 83da9ee82e..defe460f5c 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1610,7 +1610,7 @@ parse_long_var_name_map (struct sfm_reader *r, } /* Identify any duplicates. */ - if (strcasecmp (var_get_short_name (var, 0), long_name) + if (utf8_strcasecmp (var_get_short_name (var, 0), long_name) && dict_lookup_var (dict, long_name) != NULL) { sys_warn (r, record->pos, diff --git a/src/data/variable.c b/src/data/variable.c index 72d9ade675..b63e344905 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -201,7 +201,7 @@ compare_vars_by_name (const void *a_, const void *b_, const void *aux UNUSED) const struct variable *a = a_; const struct variable *b = b_; - return strcasecmp (a->name, b->name); + return utf8_strcasecmp (a->name, b->name); } /* A hsh_hash_func that hashes variable V based on its name. 
*/ @@ -210,7 +210,7 @@ hash_var_by_name (const void *v_, const void *aux UNUSED) { const struct variable *v = v_; - return hash_case_string (v->name, 0); + return utf8_hash_case_string (v->name, 0); } /* A hsh_compare_func that orders pointers to variables A and B @@ -222,7 +222,7 @@ compare_var_ptrs_by_name (const void *a_, const void *b_, struct variable *const *a = a_; struct variable *const *b = b_; - return strcasecmp (var_get_name (*a), var_get_name (*b)); + return utf8_strcasecmp (var_get_name (*a), var_get_name (*b)); } /* A hsh_compare_func that orders pointers to variables A and B @@ -246,7 +246,7 @@ hash_var_ptr_by_name (const void *v_, const void *aux UNUSED) { struct variable *const *v = v_; - return hash_case_string (var_get_name (*v), 0); + return utf8_hash_case_string (var_get_name (*v), 0); } /* Returns the type of variable V. */ diff --git a/src/data/vector.c b/src/data/vector.c index 87046ad42b..7c8ec4178d 100644 --- a/src/data/vector.c +++ b/src/data/vector.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,6 +23,7 @@ #include "data/dictionary.h" #include "data/identifier.h" #include "libpspp/assertion.h" +#include "libpspp/i18n.h" #include "libpspp/str.h" #include "gl/xalloc.h" @@ -140,6 +141,6 @@ compare_vector_ptrs_by_name (const void *a_, const void *b_) struct vector *a = *pa; struct vector *b = *pb; - return strcasecmp (a->name, b->name); + return utf8_strcasecmp (a->name, b->name); } diff --git a/src/language/control/repeat.c b/src/language/control/repeat.c index e76c4903de..b2c2bb413e 100644 --- a/src/language/control/repeat.c +++ b/src/language/control/repeat.c @@ -30,6 +30,7 @@ #include "libpspp/cast.h" #include "libpspp/hash-functions.h" #include "libpspp/hmap.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/str.h" #include "libpspp/misc.h" @@ -78,7 +79,7 @@ cmd_do_repeat (struct lexer *lexer, struct dataset *ds) static unsigned int hash_dummy (const char *name, size_t name_len) { - return hash_case_bytes (name, name_len, 0); + return utf8_hash_case_bytes (name, name_len, 0); } static const struct dummy_var * @@ -88,7 +89,7 @@ find_dummy_var (struct hmap *hmap, const char *name, size_t name_len) HMAP_FOR_EACH_WITH_HASH (dv, struct dummy_var, hmap_node, hash_dummy (name, name_len), hmap) - if (strcasecmp (dv->name, name)) + if (!utf8_strncasecmp (dv->name, strlen (dv->name), name, name_len)) return dv; return NULL; diff --git a/src/language/dictionary/modify-variables.c b/src/language/dictionary/modify-variables.c index d73b95e8a2..f0b03d48f9 100644 --- a/src/language/dictionary/modify-variables.c +++ b/src/language/dictionary/modify-variables.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,6 +28,7 @@ #include "libpspp/assertion.h" #include "libpspp/bit-vector.h" #include "libpspp/compiler.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/misc.h" #include "libpspp/str.h" @@ -355,7 +356,7 @@ compare_variables_given_ordering (const void *a_, const void *b_, result = a_index < b_index ? -1 : a_index > b_index; } else - result = strcasecmp (var_get_name (a), var_get_name (b)); + result = utf8_strcasecmp (var_get_name (a), var_get_name (b)); if (!ordering->forward) result = -result; return result; @@ -377,7 +378,7 @@ compare_var_renaming_by_new_name (const void *a_, const void *b_, const struct var_renaming *a = a_; const struct var_renaming *b = b_; - return strcasecmp (a->new_name, b->new_name); + return utf8_strcasecmp (a->new_name, b->new_name); } /* Returns true if performing VM on dictionary D would not cause diff --git a/src/language/dictionary/mrsets.c b/src/language/dictionary/mrsets.c index 2f7c8f6d29..9e1ca3b0d0 100644 --- a/src/language/dictionary/mrsets.c +++ b/src/language/dictionary/mrsets.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -417,7 +417,7 @@ parse_group (struct lexer *lexer, struct dictionary *dict, if (width == c->width && value_equal (value, &c->value, width)) { - if (!c->warned && strcasecmp (c->label, label)) + if (!c->warned && utf8_strcasecmp (c->label, label)) { char *s = data_out (value, var_get_encoding (var), var_get_print_format (var)); diff --git a/src/language/dictionary/vector.c b/src/language/dictionary/vector.c index b0d696154b..512ec3c2bf 100644 --- a/src/language/dictionary/vector.c +++ b/src/language/dictionary/vector.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,6 +27,7 @@ #include "language/lexer/lexer.h" #include "language/lexer/variable-parser.h" #include "libpspp/assertion.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/misc.h" #include "libpspp/pool.h" @@ -68,7 +69,7 @@ cmd_vector (struct lexer *lexer, struct dataset *ds) } for (i = 0; i < vector_cnt; i++) - if (!strcasecmp (vectors[i], lex_tokcstr (lexer))) + if (!utf8_strcasecmp (vectors[i], lex_tokcstr (lexer))) { msg (SE, _("Vector name %s is given twice."), lex_tokcstr (lexer)); diff --git a/src/language/lexer/variable-parser.c b/src/language/lexer/variable-parser.c index 5c19b81e2f..8c4f8fe8a6 100644 --- a/src/language/lexer/variable-parser.c +++ b/src/language/lexer/variable-parser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +30,7 @@ #include "libpspp/assertion.h" #include "libpspp/cast.h" #include "libpspp/hash-functions.h" +#include "libpspp/i18n.h" #include "libpspp/hmapx.h" #include "libpspp/message.h" #include "libpspp/misc.h" @@ -819,9 +820,9 @@ array_var_set_lookup_var_idx (const struct var_set *vs, const char *name, struct hmapx_node *node; struct variable **varp; - HMAPX_FOR_EACH_WITH_HASH (varp, node, hash_case_string (name, 0), + HMAPX_FOR_EACH_WITH_HASH (varp, node, utf8_hash_case_string (name, 0), &avs->vars_by_name) - if (!strcasecmp (name, var_get_name (*varp))) + if (!utf8_strcasecmp (name, var_get_name (*varp))) { *idx = varp - avs->var; return true; @@ -869,7 +870,7 @@ var_set_create_from_array (struct variable *const *var, size_t var_cnt) return NULL; } hmapx_insert (&avs->vars_by_name, CONST_CAST (void *, &avs->var[i]), - hash_case_string (name, 0)); + utf8_hash_case_string (name, 0)); } return vs; diff --git a/src/language/stats/descriptives.c b/src/language/stats/descriptives.c index 54bc49d946..8efb5f1fb4 100644 --- a/src/language/stats/descriptives.c +++ b/src/language/stats/descriptives.c @@ -512,7 +512,7 @@ try_name (const struct dictionary *dict, struct dsc_proc *dsc, for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dsc_var = &dsc->vars[i]; - if (dsc_var->z_name != NULL && !strcasecmp (dsc_var->z_name, name)) + if (dsc_var->z_name != NULL && !utf8_strcasecmp (dsc_var->z_name, name)) return false; } return true; @@ -1017,7 +1017,7 @@ descriptives_compare_dsc_vars (const void *a_, const void *b_, const void *dsc_) int result; if (dsc->sort_by_stat == DSC_NAME) - result = strcasecmp (var_get_name (a->v), var_get_name (b->v)); + result = utf8_strcasecmp (var_get_name (a->v), var_get_name (b->v)); else { double as = a->stats[dsc->sort_by_stat]; diff --git a/src/libpspp/hash-functions.c b/src/libpspp/hash-functions.c 
index c43017313a..7a8d8162ec 100644 --- a/src/libpspp/hash-functions.c +++ b/src/libpspp/hash-functions.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -102,53 +102,6 @@ hash_string (const char *s, unsigned int basis) return hash_bytes (s, strlen (s), basis); } -/* Returns a hash value for the N bytes at S, with lowercase and uppercase - letters treated as equal, starting from BASIS. */ -unsigned int -hash_case_bytes (const void *s_, size_t n, unsigned int basis) -{ - const char *s = s_; - uint32_t a, b, c; - uint32_t tmp[3]; - int i; - - a = b = c = 0xdeadbeef + n + basis; - - while (n >= 12) - { - for (i = 0; i < 12; i++) - ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); - a += tmp[0]; - b += tmp[1]; - c += tmp[2]; - HASH_MIX (a, b, c); - n -= 12; - s += 12; - } - - if (n > 0) - { - memset (tmp, 0, 12); - for (i = 0; i < n; i++) - ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); - a += tmp[0]; - b += tmp[1]; - c += tmp[2]; - } - - HASH_FINAL (a, b, c); - return c; -} - -/* Returns a hash value for null-terminated string S, with - lowercase and uppercase letters treated as equal, starting - from BASIS. */ -unsigned int -hash_case_string (const char *s, unsigned int basis) -{ - return hash_case_bytes (s, strlen (s), basis); -} - /* Returns a hash value for integer X, starting from BASIS. */ unsigned int hash_int (int x, unsigned int basis) diff --git a/src/libpspp/hash-functions.h b/src/libpspp/hash-functions.h index 86414fb1c2..f792ba2a96 100644 --- a/src/libpspp/hash-functions.h +++ b/src/libpspp/hash-functions.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. 
- Copyright (C) 1997-9, 2000, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2010, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,8 +21,6 @@ unsigned int hash_bytes (const void *, size_t, unsigned int basis); unsigned int hash_string (const char *, unsigned int basis); -unsigned int hash_case_bytes (const void *, size_t, unsigned int basis); -unsigned int hash_case_string (const char *, unsigned int basis); unsigned int hash_int (int, unsigned int basis); unsigned int hash_double (double, unsigned int basis); unsigned int hash_pointer (const void *, unsigned int basis); diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 754c9321f3..1779afc434 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "libpspp/assertion.h" @@ -39,6 +40,7 @@ #include "gl/c-strcase.h" #include "gl/localcharset.h" +#include "gl/minmax.h" #include "gl/xalloc.h" #include "gl/relocatable.h" #include "gl/xstrndup.h" @@ -677,6 +679,76 @@ uc_name (ucs4_t uc, char buffer[16]) return buffer; } +/* UTF-8 functions that deal with uppercase/lowercase distinctions. */ + +/* Returns a hash value for the N bytes of UTF-8 encoded data starting at S, + with lowercase and uppercase letters treated as equal, starting from + BASIS. 
*/ +unsigned int +utf8_hash_case_bytes (const char *s, size_t n, unsigned int basis) +{ + uint8_t folded_buf[2048]; + size_t folded_len = sizeof folded_buf; + uint8_t *folded_s; + unsigned int hash; + + folded_s = u8_casefold (CHAR_CAST (const uint8_t *, s), n, + NULL, UNINORM_NFKD, folded_buf, &folded_len); + if (folded_s != NULL) + { + hash = hash_bytes (folded_s, folded_len, basis); + if (folded_s != folded_buf) + free (folded_s); + } + else + { + if (errno == ENOMEM) + xalloc_die (); + hash = hash_bytes (s, n, basis); + } + + return hash; +} + +/* Returns a hash value for null-terminated UTF-8 string S, with lowercase and + uppercase letters treated as equal, starting from BASIS. */ +unsigned int +utf8_hash_case_string (const char *s, unsigned int basis) +{ + return utf8_hash_case_bytes (s, strlen (s), basis); +} + +/* Compares UTF-8 strings A and B case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strcasecmp (const char *a, const char *b) +{ + return utf8_strncasecmp (a, strlen (a), b, strlen (b)); +} + +/* Compares UTF-8 strings A (with length AN) and B (with length BN) + case-insensitively. + Returns a negative value if A < B, zero if A == B, positive if A > B. */ +int +utf8_strncasecmp (const char *a, size_t an, const char *b, size_t bn) +{ + int result; + + if (u8_casecmp (CHAR_CAST (const uint8_t *, a), an, + CHAR_CAST (const uint8_t *, b), bn, + NULL, UNINORM_NFKD, &result)) + { + if (errno == ENOMEM) + xalloc_die (); + + result = memcmp (a, b, MIN (an, bn)); + if (result == 0) + result = an < bn ? 
-1 : an > bn; + } + + return result; +} + bool get_encoding_info (struct encoding_info *e, const char *name) { diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index 4dbf61a299..380c3cbb79 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -67,6 +67,11 @@ void set_default_encoding (const char *enc); bool set_encoding_from_locale (const char *loc); const char *uc_name (ucs4_t uc, char buffer[16]); + +unsigned int utf8_hash_case_bytes (const char *, size_t n, unsigned int basis); +unsigned int utf8_hash_case_string (const char *, unsigned int basis); +int utf8_strcasecmp (const char *, const char *); +int utf8_strncasecmp (const char *, size_t, const char *, size_t); /* Information about character encodings. */ diff --git a/src/libpspp/stringi-map.c b/src/libpspp/stringi-map.c index d3e5144de7..7b8d398d03 100644 --- a/src/libpspp/stringi-map.c +++ b/src/libpspp/stringi-map.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2009, 2010, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,6 +22,7 @@ #include #include "libpspp/hash-functions.h" +#include "libpspp/i18n.h" #include "libpspp/string-set.h" #include "libpspp/stringi-set.h" @@ -148,7 +149,7 @@ stringi_map_find (const struct stringi_map *map, const char *key) struct stringi_map_node * stringi_map_find_node (const struct stringi_map *map, const char *key) { - return stringi_map_find_node__ (map, key, hash_case_string (key, 0)); + return stringi_map_find_node__ (map, key, utf8_hash_case_string (key, 0)); } /* If MAP contains KEY (or an equivalent with different case) as a key, deletes @@ -175,7 +176,7 @@ struct stringi_map_node * stringi_map_insert (struct stringi_map *map, const char *key, const char *value) { - unsigned int hash = hash_case_string (key, 0); + unsigned int hash = utf8_hash_case_string (key, 0); struct stringi_map_node *node = stringi_map_find_node__ (map, key, hash); if (node == NULL) node = stringi_map_insert__ (map, xstrdup (key), xstrdup (value), hash); @@ -189,7 +190,7 @@ stringi_map_insert (struct stringi_map *map, const char *key, struct stringi_map_node * stringi_map_insert_nocopy (struct stringi_map *map, char *key, char *value) { - unsigned int hash = hash_case_string (key, 0); + unsigned int hash = utf8_hash_case_string (key, 0); struct stringi_map_node *node = stringi_map_find_node__ (map, key, hash); if (node == NULL) node = stringi_map_insert__ (map, key, value, hash); @@ -208,7 +209,7 @@ struct stringi_map_node * stringi_map_replace (struct stringi_map *map, const char *key, const char *value) { - unsigned int hash = hash_case_string (key, 0); + unsigned int hash = utf8_hash_case_string (key, 0); struct stringi_map_node *node = stringi_map_find_node__ (map, key, hash); if (node == NULL) node = stringi_map_insert__ (map, xstrdup (key), xstrdup (value), hash); @@ -224,7 +225,7 @@ stringi_map_replace (struct stringi_map 
*map, const char *key, struct stringi_map_node * stringi_map_replace_nocopy (struct stringi_map *map, char *key, char *value) { - unsigned int hash = hash_case_string (key, 0); + unsigned int hash = utf8_hash_case_string (key, 0); struct stringi_map_node *node = stringi_map_find_node__ (map, key, hash); if (node == NULL) node = stringi_map_insert__ (map, key, value, hash); @@ -242,7 +243,7 @@ stringi_map_replace_nocopy (struct stringi_map *map, char *key, char *value) bool stringi_map_delete (struct stringi_map *map, const char *key) { - return stringi_map_delete__ (map, key, hash_case_string (key, 0)); + return stringi_map_delete__ (map, key, utf8_hash_case_string (key, 0)); } /* Deletes NODE from MAP and destroys the node and its key and value. */ @@ -344,7 +345,7 @@ stringi_map_find_node__ (const struct stringi_map *map, const char *key, HMAP_FOR_EACH_WITH_HASH (node, struct stringi_map_node, hmap_node, hash, &map->hmap) - if (!strcasecmp (key, node->key)) + if (!utf8_strcasecmp (key, node->key)) return node; return NULL; diff --git a/src/libpspp/stringi-set.c b/src/libpspp/stringi-set.c index a7ae699471..b442a41567 100644 --- a/src/libpspp/stringi-set.c +++ b/src/libpspp/stringi-set.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2009, 2010, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,6 +26,7 @@ #include "libpspp/cast.h" #include "libpspp/hash-functions.h" +#include "libpspp/i18n.h" #include "gl/xalloc.h" @@ -88,7 +89,7 @@ stringi_set_contains (const struct stringi_set *set, const char *s) struct stringi_set_node * stringi_set_find_node (const struct stringi_set *set, const char *s) { - return stringi_set_find_node__ (set, s, hash_case_string (s, 0)); + return stringi_set_find_node__ (set, s, utf8_hash_case_string (s, 0)); } /* Inserts a copy of S into SET. Returns true if successful, false if SET @@ -96,7 +97,7 @@ stringi_set_find_node (const struct stringi_set *set, const char *s) bool stringi_set_insert (struct stringi_set *set, const char *s) { - unsigned int hash = hash_case_string (s, 0); + unsigned int hash = utf8_hash_case_string (s, 0); if (!stringi_set_find_node__ (set, s, hash)) { stringi_set_insert__ (set, xstrdup (s), hash); @@ -112,7 +113,7 @@ stringi_set_insert (struct stringi_set *set, const char *s) bool stringi_set_insert_nocopy (struct stringi_set *set, char *s) { - unsigned int hash = hash_case_string (s, 0); + unsigned int hash = utf8_hash_case_string (s, 0); if (!stringi_set_find_node__ (set, s, hash)) { stringi_set_insert__ (set, s, hash); @@ -130,7 +131,7 @@ stringi_set_insert_nocopy (struct stringi_set *set, char *s) bool stringi_set_delete (struct stringi_set *set, const char *s) { - return stringi_set_delete__ (set, s, hash_case_string (s, 0)); + return stringi_set_delete__ (set, s, utf8_hash_case_string (s, 0)); } /* Deletes NODE from SET, and frees NODE and its string. */ @@ -258,7 +259,7 @@ compare_strings (const void *a_, const void *b_) { const char *const *a = a_; const char *const *b = b_; - return strcasecmp (*a, *b); + return utf8_strcasecmp (*a, *b); } /* Allocates and returns an array that points to each of the strings in SET. 
@@ -267,7 +268,7 @@ compare_strings (const void *a_, const void *b_) caller it is responsible for freeing the returned array itself (with free()). - The returned array is ordered according to strcasecmp(). */ + The returned array is ordered according to utf8_strcasecmp(). */ char ** stringi_set_get_sorted_array (const struct stringi_set *set) { @@ -286,7 +287,7 @@ stringi_set_find_node__ (const struct stringi_set *set, const char *s, HMAP_FOR_EACH_WITH_HASH (node, struct stringi_set_node, hmap_node, hash, &set->hmap) - if (!strcasecmp (s, node->string)) + if (!utf8_strcasecmp (s, node->string)) return node; return NULL; diff --git a/tests/automake.mk b/tests/automake.mk index 6c4cef20aa..737df306f8 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -238,6 +238,7 @@ TESTSUITE_AT = \ tests/data/data-in.at \ tests/data/data-out.at \ tests/data/datasheet-test.at \ + tests/data/dictionary.at \ tests/data/format-guesser.at \ tests/data/por-file.at \ tests/data/sys-file-reader.at \ diff --git a/tests/data/dictionary.at b/tests/data/dictionary.at new file mode 100644 index 0000000000..ef62c664f8 --- /dev/null +++ b/tests/data/dictionary.at @@ -0,0 +1,14 @@ +AT_BANNER([dictionary]) + +AT_SETUP([dictionary case-insensitivity]) +AT_DATA([dictionary.sps], [dnl +DATA LIST LIST /aèiöu aeiou. +BEGIN DATA +1 2 +END DATA. +LIST AÈIÖU +RENAME VARIABLE (aèiöu=AÈIÖU). +LIST. +RENAME VARIABLE (aeiou=aèiöu). +]) +AT_CLEANUP diff --git a/tests/libpspp/stringi-map-test.c b/tests/libpspp/stringi-map-test.c index a8dd9dd6b7..3ae027fb36 100644 --- a/tests/libpspp/stringi-map-test.c +++ b/tests/libpspp/stringi-map-test.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -39,6 +39,7 @@ #include "libpspp/hash-functions.h" #include "libpspp/compiler.h" +#include "libpspp/i18n.h" #include "libpspp/str.h" #include "libpspp/string-set.h" #include "libpspp/stringi-set.h" @@ -276,7 +277,7 @@ check_map_contains (struct stringi_map *map, node = stringi_map_find_node (map, key); check (node != NULL); - check (!strcasecmp (key, stringi_map_node_get_key (node))); + check (!utf8_strcasecmp (key, stringi_map_node_get_key (node))); check (!strcmp (value, stringi_map_node_get_value (node))); check (node == stringi_map_insert (map, key, "012")); diff --git a/tests/libpspp/stringi-set-test.c b/tests/libpspp/stringi-set-test.c index a20e84ce18..daf7b2fa6d 100644 --- a/tests/libpspp/stringi-set-test.c +++ b/tests/libpspp/stringi-set-test.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,6 +36,7 @@ #include #include "libpspp/compiler.h" +#include "libpspp/i18n.h" #include "libpspp/str.h" /* Exit with a failure code. 
@@ -244,7 +245,7 @@ check_set_contains (struct stringi_set *set, const char *string) node = stringi_set_find_node (set, string); check (node != NULL); - check (!strcasecmp (string, stringi_set_node_get_string (node))); + check (!utf8_strcasecmp (string, stringi_set_node_get_string (node))); } /* Checks that SET contains the CNT strings in DATA, that its structure is @@ -302,7 +303,7 @@ check_stringi_set (struct stringi_set *set, const int data[], size_t cnt) check (s == array[i]); for (j = 0; j < left; j++) - if (!strcasecmp (s, make_string (data_copy[j]))) + if (!utf8_strcasecmp (s, make_string (data_copy[j]))) { data_copy[j] = data_copy[--left]; goto next; @@ -319,7 +320,7 @@ check_stringi_set (struct stringi_set *set, const int data[], size_t cnt) for (i = 0; i < cnt; i++) { if (i > 0) - check (strcasecmp (array[i - 1], array[i]) < 0); + check (utf8_strcasecmp (array[i - 1], array[i]) < 0); check (stringi_set_contains (set, array[i])); } free (array); -- 2.30.2