pintos-os.org Git - pspp/blob - src/libpspp/encoding-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/encoding-guesser.h"
  20
  21 #include <errno.h>
  22 #include <iconv.h>
  23 #include <stdbool.h>
  24 #include <stdio.h>
  25 #include <stdint.h>
  26 #include <string.h>
  27 #include <unistr.h>
  28
  29 #include "libpspp/cast.h"
  30 #include "libpspp/i18n.h"
  31
  32 #include "gl/localcharset.h"
  33 #include "gl/c-strcase.h"
  34
  35 /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
  36    of information about encoding detection.
  37 */
  38
  39 /* Returns the encoding specified by ENCODING, which must be in one of the
  40    forms described at the top of encoding-guesser.h.  The returned string might
  41    be ENCODING itself or a suffix of it, or it might be a statically allocated
  42    string. */
  43 const char *
  44 encoding_guess_parse_encoding (const char *encoding)
  45 {
  46   if (encoding == NULL
  47       || !c_strcasecmp (encoding, "auto")
  48       || !c_strcasecmp (encoding, "auto,locale")
  49       || !c_strcasecmp (encoding, "locale"))
  50     return locale_charset ();
  51   else if (!c_strncasecmp (encoding, "auto,", 5))
  52     return encoding + 5;
  53   else
  54     return encoding;
  55 }
  56
  57 /* Returns true if ENCODING, which must be in one of the forms described at the
  58    top of encoding-guesser.h, is one that performs encoding autodetection,
  59    false otherwise. */
  60 bool
  61 encoding_guess_encoding_is_auto (const char *encoding)
  62 {
  63   return (encoding == NULL
  64           || (!c_strncasecmp (encoding, "auto", 4)
  65               && (encoding[4] == ',' || encoding[4] == '\0')));
  66 }
  67
  68 static uint16_t
  69 get_be16 (const uint8_t *data)
  70 {
  71   return (data[0] << 8) | data[1];
  72 }
  73
  74 static uint16_t
  75 get_le16 (const uint8_t *data)
  76 {
  77   return (data[1] << 8) | data[0];
  78 }
  79
  80 static uint32_t
  81 get_be32 (const uint8_t *data)
  82 {
  83   return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
  84
  85 }
  86
  87 static uint32_t
  88 get_le32 (const uint8_t *data)
  89 {
  90   return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
  91 }
  92
  93 static const char *
  94 guess_utf16 (const uint8_t *data, size_t n)
  95 {
  96   size_t even_nulls, odd_nulls;
  97
  98   if (n < ENCODING_GUESS_MIN && n % 2 != 0)
  99     return NULL;
 100
 101   even_nulls = odd_nulls = 0;
 102   while (n >= 2)
 103     {
 104       even_nulls += data[0] == 0;
 105       odd_nulls += data[1] == 0;
 106       if (data[0] == 0 && data[1] == 0)
 107         return NULL;
 108
 109       data += 2;
 110       n -= 2;
 111     }
 112
 113   if (odd_nulls > even_nulls)
 114     return "UTF-16LE";
 115   else if (even_nulls > 0)
 116     return "UTF-16BE";
 117   else
 118     return NULL;
 119 }
 120
 121 static bool
 122 is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
 123 {
 124   if (n < ENCODING_GUESS_MIN && n % 4 != 0)
 125     return false;
 126
 127   while (n >= 4)
 128     {
 129       uint32_t uc = get_u32 (data);
 130
 131       if (uc < 0x09 || uc > 0x10ffff)
 132         return false;
 133
 134       data += 4;
 135       n -= 4;
 136     }
 137
 138   return true;
 139 }
 140
 141 /* Counts and returns the number of bytes, but no more than N, starting at S
 142    that are ASCII text characters. */
 143 size_t
 144 encoding_guess_count_ascii (const void *s_, size_t n)
 145 {
 146   const uint8_t *s = s_;
 147   size_t ofs;
 148
 149   for (ofs = 0; ofs < n; ofs++)
 150     if (!encoding_guess_is_ascii_text (s[ofs]))
 151       break;
 152   return ofs;
 153 }
 154
 155 static bool
 156 is_all_utf8_text (const void *s_, size_t n)
 157 {
 158   const uint8_t *s = s_;
 159   size_t ofs;
 160
 161   ofs = 0;
 162   while (ofs < n)
 163     {
 164       uint8_t c = s[ofs];
 165       if (c < 0x80)
 166         {
 167           if (!encoding_guess_is_ascii_text (c))
 168             return false;
 169           ofs++;
 170         }
 171       else
 172         {
 173           ucs4_t uc;
 174           int mblen;
 175
 176           mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
 177           if (mblen < 0)
 178             return mblen == -2;
 179
 180           ofs += mblen;
 181         }
 182     }
 183   return true;
 184 }
 185
 186 static bool
 187 is_utf8_bom (const uint8_t *data, size_t n)
 188 {
 189   return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
 190 }
 191
 192 static bool
 193 is_bom_length (size_t n, size_t w)
 194 {
 195   return n >= ENCODING_GUESS_MIN || (n && n % w == 0);
 196 }
 197
 198 static bool
 199 is_utf16le_bom (const uint8_t *data, size_t n)
 200 {
 201   return is_bom_length (n, 2) && get_le16 (data) == 0xfeff;
 202 }
 203
 204 static bool
 205 is_utf16be_bom (const uint8_t *data, size_t n)
 206 {
 207   return is_bom_length (n, 2) && get_be16 (data) == 0xfeff;
 208 }
 209
 210 static bool
 211 is_utf32le_bom (const uint8_t *data, size_t n)
 212 {
 213   return is_bom_length (n, 4) && get_le32 (data) == 0xfeff;
 214 }
 215
 216 static bool
 217 is_utf32be_bom (const uint8_t *data, size_t n)
 218 {
 219   return is_bom_length (n, 4) && get_be32 (data) == 0xfeff;
 220 }
 221
 222 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 223    name in one of the forms described at the top of encoding-guesser.h, and
 224    DATA, which contains the first N bytes of the file.  Returns the guessed
 225    encoding, which might be ENCODING itself or a suffix of it or a statically
 226    allocated string.
 227
 228    Encoding autodetection only takes place if ENCODING actually specifies
 229    autodetection.  See encoding-guesser.h for details.
 230
 231    UTF-8 cannot be distinguished from other ASCII-based encodings until a
 232    non-ASCII text character is encountered.  If ENCODING specifies
 233    autodetection and this function returns "ASCII", then the client should
 234    process the input until it encounters an non-ASCII character (as returned by
 235    encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
 236    to make a final encoding guess.  See encoding-guesser.h for details.
 237
 238    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 239    that. */
 240 const char *
 241 encoding_guess_head_encoding (const char *encoding,
 242                               const void *data_, size_t n)
 243 {
 244   const uint8_t *data = data_;
 245   const char *fallback_encoding;
 246   const char *guess;
 247
 248   fallback_encoding = encoding_guess_parse_encoding (encoding);
 249   if (!encoding_guess_encoding_is_auto (encoding))
 250     return fallback_encoding;
 251
 252   if (n == 0)
 253     return fallback_encoding;
 254
 255   if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n))
 256     return "UTF-32";
 257
 258   if (n >= 4)
 259     {
 260       uint32_t x = get_be32 (data);
 261       if (x == 0x84319533)
 262         return "GB-18030";
 263       else if (x == 0xdd736673)
 264         return "UTF-EBCDIC";
 265     }
 266
 267   if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n))
 268     return "UTF-16";
 269
 270   if (is_utf8_bom (data, n))
 271     return "UTF-8";
 272
 273   guess = guess_utf16 (data, n);
 274   if (guess != NULL)
 275     return guess;
 276
 277   if (is_utf32 (data, n, get_be32))
 278     return "UTF-32BE";
 279   if (is_utf32 (data, n, get_le32))
 280     return "UTF-32LE";
 281
 282   /* We've tried all the "giveaways" that make the encoding obvious.  That
 283      rules out, incidentally, all the encodings with multibyte units
 284      (e.g. UTF-16, UTF-32).  Our remaining goal is to try to distinguish UTF-8
 285      from some ASCII-based fallback encoding. */
 286
 287   /* If the fallback encoding isn't ASCII compatible, give up. */
 288   if (!is_encoding_ascii_compatible (fallback_encoding))
 289     return fallback_encoding;
 290
 291   /* If the data we have clearly is not UTF-8, give up. */
 292   if (!encoding_guess_tail_is_utf8 (data, n))
 293     {
 294       /* If the fallback encoding is UTF-8, fall back on something else.*/
 295       if (is_encoding_utf8 (fallback_encoding))
 296         return "windows-1252";
 297
 298       return fallback_encoding;
 299     }
 300
 301   return "ASCII";
 302 }
 303
 304 static bool
 305 is_encoding_utf16 (const char *encoding)
 306 {
 307   return (!c_strcasecmp (encoding, "utf-16")
 308           || !c_strcasecmp (encoding, "utf16"));
 309 }
 310
 311 static bool
 312 is_encoding_utf32 (const char *encoding)
 313 {
 314   return (!c_strcasecmp (encoding, "utf-32")
 315           || !c_strcasecmp (encoding, "utf32"));
 316 }
 317
 318 /* If ENCODING is the name of an encoding that could begin with a byte-order
 319    mark, and in fact the N bytes in DATA do begin with a byte-order mark,
 320    returns the number of bytes in the byte-order mark.  Otherwise, returns 0.
 321
 322    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 323    that. */
 324 size_t
 325 encoding_guess_bom_length (const char *encoding,
 326                            const void *data_, size_t n)
 327 {
 328   const uint8_t *data = data_;
 329
 330   return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3
 331           : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 332           : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 333           : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 334           : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 335           : 0);
 336 }
 337
 338 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 339    at DATA.  DATA should start with the first non-ASCII text character (as
 340    determined by encoding_guess_is_ascii_text()) found in the input.
 341
 342    The return value will either be "UTF-8" or the fallback encoding for
 343    ENCODING.
 344
 345    See encoding-guesser.h for intended use of this function.
 346
 347    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 348    that starting with the first non-ASCII text character. */
 349 const char *
 350 encoding_guess_tail_encoding (const char *encoding,
 351                               const void *data, size_t n)
 352 {
 353
 354   if (encoding_guess_tail_is_utf8 (data, n) != 0)
 355     return "UTF-8";
 356   else
 357     {
 358       /* The data is not UTF-8. */
 359       const char *fallback_encoding = encoding_guess_parse_encoding (encoding);
 360
 361       /* If the fallback encoding is UTF-8, fall back on something else.*/
 362       if (is_encoding_utf8 (fallback_encoding))
 363         return "windows-1252";
 364
 365       return fallback_encoding;
 366     }
 367
 368 }
 369
 370 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 371    at DATA.  DATA should start with the first non-ASCII text character (as
 372    determined by encoding_guess_is_ascii_text()) found in the input.
 373
 374    The return value is:
 375
 376        0, if the encoding is definitely not UTF-8 (because the input contains
 377        byte sequences that are not valid in UTF-8).
 378
 379        1, if the encoding appears to be UTF-8 (because the input contains valid
 380        UTF-8 multibyte sequences).
 381
 382        -1, if the input contains only ASCII characters.  (This means that the
 383        input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
 384
 385    See encoding-guesser.h for intended use of this function.
 386
 387    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 388    that starting with the first non-ASCII text character. */
 389 int
 390 encoding_guess_tail_is_utf8 (const void *data, size_t n)
 391 {
 392   /* If all the bytes are in the ASCII range, it's just ASCII. */
 393   if (encoding_guess_count_ascii (data, n) == n)
 394     return -1;
 395
 396   return (n < ENCODING_GUESS_MIN
 397           ? u8_check (data, n) == NULL
 398           : is_all_utf8_text (data, n));
 399 }
 400
 401 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 402    name in one of the forms described at the top of encoding-guesser.h, and the
 403    SIZE byts in DATA, which contains the entire contents of the file.  Returns
 404    the guessed encoding, which might be ENCODING itself or a suffix of it or a
 405    statically allocated string.
 406
 407    Encoding autodetection only takes place if ENCODING actually specifies
 408    autodetection.  See encoding-guesser.h for details. */
 409 const char *
 410 encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
 411 {
 412   const char *guess;
 413
 414   guess = encoding_guess_head_encoding (encoding, text, size);
 415   if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
 416     return encoding_guess_tail_encoding (encoding, text, size);
 417   else
 418     return guess;
 419 }