pintos-os.org Git - pspp/blob - src/libpspp/i18n.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006, 2010, 2011, 2012, 2014, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #ifndef I18N_H
  18 #define I18N_H
  19
  20 /*
  21
  22   PSPP has three ``working'' locales:
  23
  24   * The user interface locale.
  25
  26     This is the locale which is visible to the person using pspp.  Error
  27     messages and confidence indications are written in this locale.  For
  28     example ``Cannot open file'' will be written in the user interface locale.
  29
  30     This locale is set from the environment of the user who starts PSPP or from
  31     the system locale if not set.
  32
  33   * The output locale.
  34
  35     This locale should be visible to the person reading a report generated by
  36     pspp.  Non-data related strings (e.g., "Page number", "Standard Deviation"
  37     etc.) appear in this locale.
  38
  39   * The data locale.
  40
  41     Only the character encoding is relevant.
  42
  43     This locale is the one associated with the data being analysed.  The only
  44     important aspect of this locale is the character encoding.  (It might also
  45     be desirable for the LC_COLLATE category to be used for the purposes of
  46     sorting data.)  The dictionary pertaining to the data contains a field
  47     denoting the encoding.  Any string data stored in a "union value" is
  48     encoded in the dictionary's character set.
  49
  50   Each of these locales may, at different times, take separate (or identical)
  51   values.  So, for example, a French statistician can use pspp to prepare a
  52   report in the English language, using a datafile which was created by a
  53   Japanese researcher and hence uses a Japanese character set.
  54
  55   It's rarely, if ever, necessary to interrogate the system to find out the
  56   values of the 3 locales.  However it's important to be aware of the source
  57   (destination) locale when reading (writing) string data.  When transferring
  58   data between a source and a destination, the appropriate recoding must be
  59   performed.
  60
  61   System Files
  62   ============
  63
  64   '.sav' files contain a field which is supposed to identify the encoding of
  65   the data they contain.  However, many files produced by early versions of
  66   spss set this to "2" (ASCII) regardless of the encoding of the data.  Later
  67   versions contain an additional record (the "Character Encoding Record")
  68   describing the encoding.  When a system file is read, the dictionary's
  69   encoding is set using information gleaned from the system file.  If the
  70   encoding cannot be determined or would be unreliable, then it remains unset.
  71
  72   GUI
  73   ===
  74
  75   The psppire graphical user interface is written using the GTK+ API, for
  76   which all strings must be encoded in UTF-8.  All strings passed to the
  77   GTK+/GLib library functions (except for filenames) must be UTF-8 encoded;
  78   otherwise errors will occur.  Thus, for the purposes of programming PSPPIRE,
  79   the user interface locale should be assumed to be UTF-8, even if setlocale()
  80   and/or nl_langinfo() indicate otherwise.
  81
  82   Filenames
  83   ---------
  84
  85   The GLib API has some special functions for dealing with filenames.  Strings
  86   returned from functions like gtk_file_chooser_get_filename() are not, in
  87   general, encoded in UTF-8, but in "filename" encoding.  If that filename is
  88   passed to another GLib function which expects a filename, no conversion is
  89   necessary.  If it's passed to a function for the purposes of displaying it
  90   (e.g. in a window's title-bar) it must be converted to UTF-8 (GLib provides
  91   g_filename_display_name and g_filename_display_basename for this).  If,
  92   however, a filename needs to be passed outside of GTK+/GLib, e.g. to fopen,
  93   it must be converted to the local system encoding.
  94
  95   Existing Locale Handling Functions
  96   ==================================
  97
  98   The major aspect of locale handling which the programmer has to consider is
  99   that of character encoding.  recode_string() is the main function for
 100   changing the encoding of strings.
 101
 102   To minimise the number of conversions required, and to simplify design, PSPP
 103   attempts to store all internal strings in UTF-8 encoding.  Thus, when reading
 104   system and portable files (or any other data source), the following items are
 105   immediately converted to UTF-8:
 106
 107   * Variable names
 108   * Variable labels
 109   * Value labels
 110
 111   Conversely, when writing system files, these are converted back to the
 112   encoding of that system file.
 113
 114   String data stored in "union value"s are left in their original encoding.
 115   These are converted for display later by data_out().
 116
 117   Quirks
 118   ======
 119
 120   For historical reasons, not all locale handling follows POSIX conventions.
 121   This makes it difficult (impossible?) to elegantly handle issues.  For
 122   example, it would make sense for the GUI's datasheet to display numbers
 123   formatted according to LC_NUMERIC.  Instead, however, there is data_out(),
 124   which uses the settings_get_decimal_char() function instead of the locale's
 125   decimal separator.  Similarly, monetary values are formatted in a
 126   PSPP/SPSS-specific fashion instead of using LC_MONETARY.
 127 */
 128
 129 #include "libpspp/compiler.h"
 130 #include "libpspp/str.h"
 131 #include <stdbool.h>
 132 #include <unistr.h>
 133 /* Module set-up and tear-down entry points; neither takes arguments or
        returns a value.  NOTE(review): presumably i18n_init() must be called
        before the other functions declared here and i18n_done() releases what
        it set up -- confirm against i18n.c. */
 134 void  i18n_done (void);
 135 void  i18n_init (void);
 136 /* Encoding name for UTF-8, as a string literal. */
 137 #define UTF8 "UTF-8"
 138
 139 /* The encoding of literal strings in PSPP source code, as seen at execution
 140    time.  In fact this is likely to be some extended ASCII encoding, such as
 141    UTF-8 or ISO-8859-1, but ASCII is adequate for our purposes. */
 142 #define C_ENCODING "ASCII"
 143 /* Forward declaration; only pointers to struct pool are used below. */
 144 struct pool;
 145 /* Recoding functions.  The overview comment at the top of this file names
        recode_string() as the main function for changing the encoding of
        strings.  NOTE(review): TO and FROM are presumably the names of the
        destination and source encodings, and the *_pool variants presumably
        allocate their results from the given pool -- confirm against
        i18n.c. */
 146 char recode_byte (const char *to, const char *from, char);
 147
 148 char *recode_string (const char *to, const char *from,
 149                      const char *text, int len);
 150 char *recode_string_pool (const char *to, const char *from,
 151                           const char *text, int length, struct pool *);
 152 struct substring recode_substring_pool (const char *to, const char *from,
 153                                         struct substring text, struct pool *);
 154 int recode_pedantically (const char *to, const char *from,
 155                          struct substring text, struct pool *,
 156                          struct substring *out);
 157
 158 size_t recode_string_len (const char *to, const char *from,
 159                           const char *text, int len);
 160 /* Truncation and concatenation helpers that take an ENCODING name and a
        MAX_LEN limit.  Each function returning char * is paired with a *_len
        function returning size_t.  NOTE(review): presumably the inputs are
        UTF-8 and MAX_LEN is a byte limit measured in ENCODING; ownership of
        the char * results is not visible here -- confirm against i18n.c. */
 161 char *utf8_encoding_trunc (const char *, const char *encoding,
 162                            size_t max_len);
 163 size_t utf8_encoding_trunc_len (const char *, const char *encoding,
 164                                 size_t max_len);
 165
 166 char *utf8_encoding_concat (const char *head, const char *tail,
 167                             const char *encoding, size_t max_len);
 168 size_t utf8_encoding_concat_len (const char *head, const char *tail,
 169                                  const char *encoding, size_t max_len);
 170 /* NOTE(review): presumably these convert between byte counts and display
        columns for a UTF-8 string -- confirm against i18n.c. */
 171 size_t utf8_count_columns (const char *, size_t);
 172 size_t utf8_columns_to_bytes (const char *, size_t, size_t n_columns);
 173 /* NOTE(review): presumably conversions between UTF-8 and the "filename"
        encoding discussed in the overview comment; ownership of the returned
        strings is not visible here -- confirm against i18n.c. */
 174 char *utf8_to_filename (const char *filename);
 175 char *filename_to_utf8 (const char *filename);
 176 /* NOTE(review): presumably reports whether ENC names an encoding that the
        recoding functions can use -- confirm against i18n.c. */
 177 bool valid_encoding (const char *enc);
 178 /* NOTE(review): presumably the decimal point character of the system
        locale -- confirm against i18n.c. */
 179 char get_system_decimal (void);
 180 /* Accessors for the default encoding name.  NOTE(review): the lifetime of
        the string returned by the getter is not visible here -- confirm. */
 181 const char * get_default_encoding (void);
 182 void set_default_encoding (const char *enc);
 183
 184 bool set_encoding_from_locale (const char *loc);
 185 /* BUFFER is declared with 16 elements.  NOTE(review): presumably it receives
        a printable name for UC and is what the function returns -- confirm. */
 186 const char *uc_name (ucs4_t uc, char buffer[16]);
 187 /* Case-related operations on UTF-8 text: hashing, comparison, and case
        conversion.  The three hash functions are marked WARN_UNUSED_RESULT.
        NOTE(review): presumably the hash and comparison functions ignore case,
        so that strings equal under utf8_strcasecmp() hash alike, and the
        utf8_to_*() functions return newly allocated strings -- confirm against
        i18n.c. */
 188 unsigned int utf8_hash_case_bytes (const char *, size_t n, unsigned int basis) WARN_UNUSED_RESULT;
 189 unsigned int utf8_hash_case_string (const char *, unsigned int basis) WARN_UNUSED_RESULT;
 190 unsigned int utf8_hash_case_substring (struct substring, unsigned int basis)
 191   WARN_UNUSED_RESULT;
 192 int utf8_strcasecmp (const char *, const char *);
 193 int utf8_sscasecmp (struct substring, struct substring);
 194 int utf8_strncasecmp (const char *, size_t, const char *, size_t);
 195 int utf8_strverscasecmp (const char *, const char *);
 196 char *utf8_to_upper (const char *);
 197 char *utf8_to_lower (const char *);
 198 char *utf8_to_title (const char *);
 199 \f
 200 /* Information about character encodings. */
 201
 202 /* ISO C defines a set of characters that a C implementation must support at
 203    runtime, called the C basic execution character set, which consists of the
 204    following characters:
 205
 206        A B C D E F G H I J K L M
 207        N O P Q R S T U V W X Y Z
 208        a b c d e f g h i j k l m
 209        n o p q r s t u v w x y z
 210        0 1 2 3 4 5 6 7 8 9
 211        ! " # % & ' () * + , - . / :
 212        ; < = > ? [ \ ] ^ _ { | } ~
 213        space \a \b \r \n \t \v \f \0
 214
 215    The following is true of every member of the C basic execution character
 216    set in all "reasonable" encodings:
 217
 218        1. Every member of the C basic character set is encoded.
 219
 220        2. Every member of the C basic character set has the same width in
 221           bytes, called the "unit width".  Most encodings have a unit width of
 222           1 byte, but UCS-2 and UTF-16 have a unit width of 2 bytes and UCS-4
 223           and UTF-32 have a unit width of 4 bytes.
 224
 225        3. In a stateful encoding, the encoding of members of the C basic
 226           character set does not vary with shift state.
 227
 228        4. When a string is read unit-by-unit, a unit that has the encoded value
 229           of a member of the C basic character set, EXCEPT FOR THE DECIMAL
 230           DIGITS, always represents that member.  That is, if the encoding has
 231           multi-unit characters, the units that encode the C basic character
 232           set are never part of a multi-unit character.
 233
 234           The exception for decimal digits is due to GB18030, which uses
 235           decimal digits as part of multi-byte encodings.
 236
 237    All 8-bit and wider encodings that I have been able to find follow these
 238    rules.  7-bit and narrower encodings (e.g. UTF-7) do not.  I'm not too
 239    concerned about that. */
 240
 241 #include <stdbool.h>
 242
 243 /* Maximum width of a unit, in bytes; sizes the cr, lf, and space arrays in
 244    struct encoding_info.  UTF-32 with 4-byte units is the widest that I am
        aware of. */
 245 #define MAX_UNIT 4
 246
 247 /* Information about an encoding.  NOTE(review): presumably filled in by
        get_encoding_info(), declared below -- confirm against i18n.c. */
 248 struct encoding_info
 249   {
 250     /* Encoding name.  IANA says character set names may be up to 40 US-ASCII
 251        characters; the array has one extra byte, presumably for the null
            terminator. */
 252     char name[41];
 253
 254     /* True if this encoding has a unit width of 1 byte, and every character
 255        used in ASCII text files has the same value in this encoding. */
 256     bool is_ascii_compatible;
 257
 258     /* True if this encoding has a unit width of 1 byte and appears to be
 259        EBCDIC-based.  */
 260     bool is_ebcdic_compatible;
 261
 262     /* Character information.  Each array below has room for MAX_UNIT bytes,
            of which 'unit' bytes hold the encoded character. */
 263     int unit;                   /* Unit width, in bytes. */
 264     char cr[MAX_UNIT];          /* \r in encoding, 'unit' bytes long. */
 265     char lf[MAX_UNIT];          /* \n in encoding, 'unit' bytes long. */
 266     char space[MAX_UNIT];       /* ' ' in encoding, 'unit' bytes long. */
 267   };
 268 /* Queries about the encoding named by the string argument.  NOTE(review):
        presumably get_encoding_info() stores its findings through the struct
        pointer, and the two *_compatible predicates report the like-named
        struct encoding_info members -- confirm against i18n.c. */
 269 bool get_encoding_info (struct encoding_info *, const char *name);
 270 bool is_encoding_ascii_compatible (const char *encoding);
 271 bool is_encoding_ebcdic_compatible (const char *encoding);
 272 bool is_encoding_supported (const char *encoding);
 273
 274 bool is_encoding_utf8 (const char *encoding);
 275 \f
 276 /* Database of encodings, by language or region. */
 277 /* One category: its name plus the encodings that belong to it. */
 278 struct encoding_category
 279   {
 280     const char *category;       /* e.g. "Arabic" or "Western European". */
 281     const char **encodings;     /* Encodings within the category. */
 282     size_t n_encodings;         /* Number of encodings in category. */
 283   };
 284 /* NOTE(review): presumably the returned pointer addresses an array of
        get_n_encoding_categories() elements -- confirm against i18n.c. */
 285 struct encoding_category *get_encoding_categories (void);
 286 size_t get_n_encoding_categories (void);
 287
 288 /* Returns the ISO two-letter code for the current LC_MESSAGES
 289    locale category.  NOTE(review): ownership of the returned string is not
        visible here -- confirm whether the caller must free it. */
 290 char *get_language (void);
 291
 292 #endif /* i18n.h */