pintos-os.org Git - pspp/blob - src/libpspp/encoding-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/encoding-guesser.h"
  20
  21 #include <errno.h>
  22 #include <iconv.h>
  23 #include <stdbool.h>
  24 #include <stdio.h>
  25 #include <stdint.h>
  26 #include <string.h>
  27 #include <unistr.h>
  28
  29 #include "libpspp/cast.h"
  30 #include "libpspp/i18n.h"
  31
  32 #include "gl/localcharset.h"
  33 #include "gl/c-strcase.h"
  34
  35 /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
  36    of information about encoding detection.
  37 */
  38
  39 /* Returns the encoding specified by ENCODING, which must be in one of the
  40    forms described at the top of encoding-guesser.h.  The returned string might
  41    be ENCODING itself or a suffix of it, or it might be a statically allocated
  42    string. */
  43 const char *
  44 encoding_guess_parse_encoding (const char *encoding)
  45 {
  46   if (encoding == NULL
  47       || !c_strcasecmp (encoding, "auto")
  48       || !c_strcasecmp (encoding, "auto,locale")
  49       || !c_strcasecmp (encoding, "locale"))
  50     return locale_charset ();
  51   else if (!c_strncasecmp (encoding, "auto,", 5))
  52     return encoding + 5;
  53   else
  54     return encoding;
  55 }
  56
  57 /* Returns true if ENCODING, which must be in one of the forms described at the
  58    top of encoding-guesser.h, is one that performs encoding autodetection,
  59    false otherwise. */
  60 bool
  61 encoding_guess_encoding_is_auto (const char *encoding)
  62 {
  63   return (encoding == NULL
  64           || (!c_strncasecmp (encoding, "auto", 4)
  65               && (encoding[4] == ',' || encoding[4] == '\0')));
  66 }
  67
  68 static uint16_t
  69 get_be16 (const uint8_t *data)
  70 {
  71   return (data[0] << 8) | data[1];
  72 }
  73
  74 static uint16_t
  75 get_le16 (const uint8_t *data)
  76 {
  77   return (data[1] << 8) | data[0];
  78 }
  79
  80 static uint32_t
  81 get_be32 (const uint8_t *data)
  82 {
  83   return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
  84
  85 }
  86
  87 static uint32_t
  88 get_le32 (const uint8_t *data)
  89 {
  90   return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
  91 }
  92
  93 static const char *
  94 guess_utf16 (const uint8_t *data, size_t n)
  95 {
  96   size_t even_nulls, odd_nulls;
  97
  98   if (n < ENCODING_GUESS_MIN && n % 2 != 0)
  99     return NULL;
 100
 101   even_nulls = odd_nulls = 0;
 102   while (n >= 2)
 103     {
 104       even_nulls += data[0] == 0;
 105       odd_nulls += data[1] == 0;
 106       if (data[0] == 0 && data[1] == 0)
 107         return NULL;
 108
 109       data += 2;
 110       n -= 2;
 111     }
 112
 113   if (odd_nulls > even_nulls)
 114     return "UTF-16LE";
 115   else if (even_nulls > 0)
 116     return "UTF-16BE";
 117   else
 118     return NULL;
 119 }
 120
 121 static bool
 122 is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
 123 {
 124   if (n < ENCODING_GUESS_MIN && n % 4 != 0)
 125     return false;
 126
 127   while (n >= 4)
 128     {
 129       uint32_t uc = get_u32 (data);
 130
 131       if (uc < 0x09 || uc > 0x10ffff)
 132         return false;
 133
 134       data += 4;
 135       n -= 4;
 136     }
 137
 138   return true;
 139 }
 140
 141 /* Counts and returns the number of bytes, but no more than N, starting at S
 142    that are ASCII text characters. */
 143 size_t
 144 encoding_guess_count_ascii (const void *s_, size_t n)
 145 {
 146   const uint8_t *s = s_;
 147   size_t ofs;
 148
 149   for (ofs = 0; ofs < n; ofs++)
 150     if (!encoding_guess_is_ascii_text (s[ofs]))
 151       break;
 152   return ofs;
 153 }
 154
 155 static bool
 156 is_all_utf8_text (const void *s_, size_t n)
 157 {
 158   const uint8_t *s = s_;
 159   size_t ofs;
 160
 161   ofs = 0;
 162   while (ofs < n)
 163     {
 164       uint8_t c = s[ofs];
 165       if (c < 0x80)
 166         {
 167           if (!encoding_guess_is_ascii_text (c))
 168             return false;
 169           ofs++;
 170         }
 171       else
 172         {
 173           ucs4_t uc;
 174           int mblen;
 175
 176           mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
 177           if (mblen < 0)
 178             return mblen == -2;
 179
 180           ofs += mblen;
 181         }
 182     }
 183   return true;
 184 }
 185
 186 static bool
 187 is_utf8_bom (const uint8_t *data, size_t n)
 188 {
 189   return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
 190 }
 191
 192 static bool
 193 is_utf16le_bom (const uint8_t *data, size_t n)
 194 {
 195   return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff;
 196 }
 197
 198 static bool
 199 is_utf16be_bom (const uint8_t *data, size_t n)
 200 {
 201   return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff;
 202 }
 203
 204 static bool
 205 is_utf32le_bom (const uint8_t *data, size_t n)
 206 {
 207   return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff;
 208 }
 209
 210 static bool
 211 is_utf32be_bom (const uint8_t *data, size_t n)
 212 {
 213   return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff;
 214 }
 215
 216 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 217    name in one of the forms described at the top of encoding-guesser.h, and
 218    DATA, which contains the first N bytes of the file.  Returns the guessed
 219    encoding, which might be ENCODING itself or a suffix of it or a statically
 220    allocated string.
 221
 222    Encoding autodetection only takes place if ENCODING actually specifies
 223    autodetection.  See encoding-guesser.h for details.
 224
 225    UTF-8 cannot be distinguished from other ASCII-based encodings until a
 226    non-ASCII text character is encountered.  If ENCODING specifies
 227    autodetection and this function returns "ASCII", then the client should
 228    process the input until it encounters an non-ASCII character (as returned by
 229    encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
 230    to make a final encoding guess.  See encoding-guesser.h for details.
 231
 232    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 233    that. */
 234 const char *
 235 encoding_guess_head_encoding (const char *encoding,
 236                               const void *data_, size_t n)
 237 {
 238   const uint8_t *data = data_;
 239   const char *fallback_encoding;
 240   const char *guess;
 241
 242   fallback_encoding = encoding_guess_parse_encoding (encoding);
 243   if (!encoding_guess_encoding_is_auto (encoding))
 244     return fallback_encoding;
 245
 246   if (n == 0)
 247     return fallback_encoding;
 248
 249   if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n))
 250     return "UTF-32";
 251
 252   if (n >= 4)
 253     {
 254       uint32_t x = get_be32 (data);
 255       if (x == 0x84319533)
 256         return "GB-18030";
 257       else if (x == 0xdd736673)
 258         return "UTF-EBCDIC";
 259     }
 260
 261   if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n))
 262     return "UTF-16";
 263
 264   if (is_utf8_bom (data, n))
 265     return "UTF-8";
 266
 267   guess = guess_utf16 (data, n);
 268   if (guess != NULL)
 269     return guess;
 270
 271   if (is_utf32 (data, n, get_be32))
 272     return "UTF-32BE";
 273   if (is_utf32 (data, n, get_le32))
 274     return "UTF-32LE";
 275
 276   /* We've tried all the "giveaways" that make the encoding obvious.  That
 277      rules out, incidentally, all the encodings with multibyte units
 278      (e.g. UTF-16, UTF-32).  Our remaining goal is to try to distinguish UTF-8
 279      from some ASCII-based fallback encoding. */
 280
 281   /* If the fallback encoding isn't ASCII compatible, give up. */
 282   if (!is_encoding_ascii_compatible (fallback_encoding))
 283     return fallback_encoding;
 284
 285   /* If the data we have clearly is not UTF-8, give up. */
 286   if (!encoding_guess_tail_is_utf8 (data, n))
 287     {
 288       /* If the fallback encoding is UTF-8, fall back on something else.*/
 289       if (is_encoding_utf8 (fallback_encoding))
 290         return "windows-1252";
 291
 292       return fallback_encoding;
 293     }
 294
 295   return "ASCII";
 296 }
 297
 298 static bool
 299 is_encoding_utf16 (const char *encoding)
 300 {
 301   return (!c_strcasecmp (encoding, "utf-16")
 302           || !c_strcasecmp (encoding, "utf16"));
 303 }
 304
 305 static bool
 306 is_encoding_utf32 (const char *encoding)
 307 {
 308   return (!c_strcasecmp (encoding, "utf-32")
 309           || !c_strcasecmp (encoding, "utf32"));
 310 }
 311
 312 /* If ENCODING is the name of an encoding that could begin with a byte-order
 313    mark, and in fact the N bytes in DATA do begin with a byte-order mark,
 314    returns the number of bytes in the byte-order mark.  Otherwise, returns 0.
 315
 316    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 317    that. */
 318 size_t
 319 encoding_guess_bom_length (const char *encoding,
 320                            const void *data_, size_t n)
 321 {
 322   const uint8_t *data = data_;
 323
 324   return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3
 325           : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 326           : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 327           : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 328           : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 329           : 0);
 330 }
 331
 332 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 333    at DATA.  DATA should start with the first non-ASCII text character (as
 334    determined by encoding_guess_is_ascii_text()) found in the input.
 335
 336    The return value will either be "UTF-8" or the fallback encoding for
 337    ENCODING.
 338
 339    See encoding-guesser.h for intended use of this function.
 340
 341    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 342    that starting with the first non-ASCII text character. */
 343 const char *
 344 encoding_guess_tail_encoding (const char *encoding,
 345                               const void *data, size_t n)
 346 {
 347
 348   if (encoding_guess_tail_is_utf8 (data, n) != 0)
 349     return "UTF-8";
 350   else
 351     {
 352       /* The data is not UTF-8. */
 353       const char *fallback_encoding = encoding_guess_parse_encoding (encoding);
 354
 355       /* If the fallback encoding is UTF-8, fall back on something else.*/
 356       if (is_encoding_utf8 (fallback_encoding))
 357         return "windows-1252";
 358
 359       return fallback_encoding;
 360     }
 361
 362 }
 363
 364 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 365    at DATA.  DATA should start with the first non-ASCII text character (as
 366    determined by encoding_guess_is_ascii_text()) found in the input.
 367
 368    The return value is:
 369
 370        0, if the encoding is definitely not UTF-8 (because the input contains
 371        byte sequences that are not valid in UTF-8).
 372
 373        1, if the encoding appears to be UTF-8 (because the input contains valid
 374        UTF-8 multibyte sequences).
 375
 376        -1, if the input contains only ASCII characters.  (This means that the
 377        input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
 378
 379    See encoding-guesser.h for intended use of this function.
 380
 381    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 382    that starting with the first non-ASCII text character. */
 383 int
 384 encoding_guess_tail_is_utf8 (const void *data, size_t n)
 385 {
 386   /* If all the bytes are in the ASCII range, it's just ASCII. */
 387   if (encoding_guess_count_ascii (data, n) == n)
 388     return -1;
 389
 390   return (n < ENCODING_GUESS_MIN
 391           ? u8_check (data, n) == NULL
 392           : is_all_utf8_text (data, n));
 393 }
 394
 395 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 396    name in one of the forms described at the top of encoding-guesser.h, and the
 397    SIZE byts in DATA, which contains the entire contents of the file.  Returns
 398    the guessed encoding, which might be ENCODING itself or a suffix of it or a
 399    statically allocated string.
 400
 401    Encoding autodetection only takes place if ENCODING actually specifies
 402    autodetection.  See encoding-guesser.h for details. */
 403 const char *
 404 encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
 405 {
 406   const char *guess;
 407
 408   guess = encoding_guess_head_encoding (encoding, text, size);
 409   if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
 410     return encoding_guess_tail_encoding (encoding, text, size);
 411   else
 412     return guess;
 413 }