pintos-os.org Git - pspp/blob - src/libpspp/encoding-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "libpspp/encoding-guesser.h"
  20
  21 #include <errno.h>
  22 #include <iconv.h>
  23 #include <stdbool.h>
  24 #include <stdio.h>
  25 #include <stdint.h>
  26 #include <string.h>
  27 #include <unistr.h>
  28
  29 #include "libpspp/cast.h"
  30 #include "libpspp/i18n.h"
  31
  32 #include "gl/localcharset.h"
  33 #include "gl/c-strcase.h"
  34
  35 /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
  36    of information about encoding detection.
  37 */
  38
  39 /* Returns the encoding specified by ENCODING, which must be in one of the
  40    forms described at the top of encoding-guesser.h.  The returned string might
  41    be ENCODING itself or a suffix of it, or it might be a statically allocated
  42    string. */
  43 const char *
  44 encoding_guess_parse_encoding (const char *encoding)
  45 {
  46   const char *fallback;
  47
  48   if (encoding == NULL
  49       || !c_strcasecmp (encoding, "auto")
  50       || !c_strcasecmp (encoding, "auto,locale")
  51       || !c_strcasecmp (encoding, "locale"))
  52     fallback = locale_charset ();
  53   else if (!c_strncasecmp (encoding, "auto,", 5))
  54     fallback = encoding + 5;
  55   else
  56     return encoding;
  57
  58   return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
  59 }
  60
  61 /* Returns true if ENCODING, which must be in one of the forms described at the
  62    top of encoding-guesser.h, is one that performs encoding autodetection,
  63    false otherwise. */
  64 bool
  65 encoding_guess_encoding_is_auto (const char *encoding)
  66 {
  67   return (encoding == NULL
  68           || (!c_strncasecmp (encoding, "auto", 4)
  69               && (encoding[4] == ',' || encoding[4] == '\0')));
  70 }
  71
  72 static uint16_t
  73 get_be16 (const uint8_t *data)
  74 {
  75   return (data[0] << 8) | data[1];
  76 }
  77
  78 static uint16_t
  79 get_le16 (const uint8_t *data)
  80 {
  81   return (data[1] << 8) | data[0];
  82 }
  83
  84 static uint32_t
  85 get_be32 (const uint8_t *data)
  86 {
  87   return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
  88
  89 }
  90
  91 static uint32_t
  92 get_le32 (const uint8_t *data)
  93 {
  94   return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
  95 }
  96
  97 static const char *
  98 guess_utf16 (const uint8_t *data, size_t n)
  99 {
 100   size_t even_nulls, odd_nulls;
 101
 102   if (n < ENCODING_GUESS_MIN && n % 2 != 0)
 103     return NULL;
 104
 105   even_nulls = odd_nulls = 0;
 106   while (n >= 2)
 107     {
 108       even_nulls += data[0] == 0;
 109       odd_nulls += data[1] == 0;
 110       if (data[0] == 0 && data[1] == 0)
 111         return NULL;
 112
 113       data += 2;
 114       n -= 2;
 115     }
 116
 117   if (odd_nulls > even_nulls)
 118     return "UTF-16LE";
 119   else if (even_nulls > 0)
 120     return "UTF-16BE";
 121   else
 122     return NULL;
 123 }
 124
 125 static bool
 126 is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
 127 {
 128   if (n < ENCODING_GUESS_MIN && n % 4 != 0)
 129     return false;
 130
 131   while (n >= 4)
 132     {
 133       uint32_t uc = get_u32 (data);
 134
 135       if (uc < 0x09 || uc > 0x10ffff)
 136         return false;
 137
 138       data += 4;
 139       n -= 4;
 140     }
 141
 142   return true;
 143 }
 144
 145 /* Counts and returns the number of bytes, but no more than N, starting at S
 146    that are ASCII text characters. */
 147 size_t
 148 encoding_guess_count_ascii (const void *s_, size_t n)
 149 {
 150   const uint8_t *s = s_;
 151   size_t ofs;
 152
 153   for (ofs = 0; ofs < n; ofs++)
 154     if (!encoding_guess_is_ascii_text (s[ofs]))
 155       break;
 156   return ofs;
 157 }
 158
 159 static bool
 160 is_all_utf8_text (const void *s_, size_t n)
 161 {
 162   const uint8_t *s = s_;
 163   size_t ofs;
 164
 165   ofs = 0;
 166   while (ofs < n)
 167     {
 168       uint8_t c = s[ofs];
 169       if (c < 0x80)
 170         {
 171           if (!encoding_guess_is_ascii_text (c))
 172             return false;
 173           ofs++;
 174         }
 175       else
 176         {
 177           ucs4_t uc;
 178           int mblen;
 179
 180           mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
 181           if (mblen < 0)
 182             return mblen == -2;
 183
 184           ofs += mblen;
 185         }
 186     }
 187   return true;
 188 }
 189
 190 static bool
 191 is_utf8_bom (const uint8_t *data, size_t n)
 192 {
 193   return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
 194 }
 195
 196 static bool
 197 is_utf16le_bom (const uint8_t *data, size_t n)
 198 {
 199   return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff;
 200 }
 201
 202 static bool
 203 is_utf16be_bom (const uint8_t *data, size_t n)
 204 {
 205   return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff;
 206 }
 207
 208 static bool
 209 is_utf32le_bom (const uint8_t *data, size_t n)
 210 {
 211   return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff;
 212 }
 213
 214 static bool
 215 is_utf32be_bom (const uint8_t *data, size_t n)
 216 {
 217   return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff;
 218 }
 219
 220 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 221    name in one of the forms described at the top of encoding-guesser.h, and
 222    DATA, which contains the first N bytes of the file.  Returns the guessed
 223    encoding, which might be ENCODING itself or a suffix of it or a statically
 224    allocated string.
 225
 226    Encoding autodetection only takes place if ENCODING actually specifies
 227    autodetection.  See encoding-guesser.h for details.
 228
 229    UTF-8 cannot be distinguished from other ASCII-based encodings until a
 230    non-ASCII text character is encountered.  If ENCODING specifies
 231    autodetection and this function returns "ASCII", then the client should
 232    process the input until it encounters an non-ASCII character (as returned by
 233    encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
 234    to make a final encoding guess.  See encoding-guesser.h for details.
 235
 236    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 237    that. */
 238 const char *
 239 encoding_guess_head_encoding (const char *encoding,
 240                               const void *data_, size_t n)
 241 {
 242   const uint8_t *data = data_;
 243   const char *fallback_encoding;
 244   const char *guess;
 245
 246   fallback_encoding = encoding_guess_parse_encoding (encoding);
 247   if (!encoding_guess_encoding_is_auto (encoding))
 248     return fallback_encoding;
 249
 250   if (n == 0)
 251     return fallback_encoding;
 252
 253   if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n))
 254     return "UTF-32";
 255
 256   if (n >= 4)
 257     {
 258       uint32_t x = get_be32 (data);
 259       if (x == 0x84319533)
 260         return "GB-18030";
 261       else if (x == 0xdd736673)
 262         return "UTF-EBCDIC";
 263     }
 264
 265   if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n))
 266     return "UTF-16";
 267
 268   if (is_utf8_bom (data, n))
 269     return "UTF-8";
 270
 271   guess = guess_utf16 (data, n);
 272   if (guess != NULL)
 273     return guess;
 274
 275   if (is_utf32 (data, n, get_be32))
 276     return "UTF-32BE";
 277   if (is_utf32 (data, n, get_le32))
 278     return "UTF-32LE";
 279
 280   if (!is_encoding_ascii_compatible (fallback_encoding)
 281       || !encoding_guess_tail_is_utf8 (data, n))
 282     return fallback_encoding;
 283
 284   return "ASCII";
 285 }
 286
 287 static bool
 288 is_encoding_utf16 (const char *encoding)
 289 {
 290   return (!c_strcasecmp (encoding, "utf-16")
 291           || !c_strcasecmp (encoding, "utf16"));
 292 }
 293
 294 static bool
 295 is_encoding_utf32 (const char *encoding)
 296 {
 297   return (!c_strcasecmp (encoding, "utf-32")
 298           || !c_strcasecmp (encoding, "utf32"));
 299 }
 300
 301 /* If ENCODING is the name of an encoding that could begin with a byte-order
 302    mark, and in fact the N bytes in DATA do begin with a byte-order mark,
 303    returns the number of bytes in the byte-order mark.  Otherwise, returns 0.
 304
 305    N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 306    that. */
 307 size_t
 308 encoding_guess_bom_length (const char *encoding,
 309                            const void *data_, size_t n)
 310 {
 311   const uint8_t *data = data_;
 312
 313   return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3
 314           : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 315           : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 316           : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 317           : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 318           : 0);
 319 }
 320
 321 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 322    at DATA.  DATA should start with the first non-ASCII text character (as
 323    determined by encoding_guess_is_ascii_text()) found in the input.
 324
 325    The return value will either be "UTF-8" or the fallback encoding for
 326    ENCODING.
 327
 328    See encoding-guesser.h for intended use of this function.
 329
 330    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 331    that starting with the first non-ASCII text character. */
 332 const char *
 333 encoding_guess_tail_encoding (const char *encoding,
 334                               const void *data, size_t n)
 335 {
 336   return (encoding_guess_tail_is_utf8 (data, n) != 0
 337           ? "UTF-8"
 338           : encoding_guess_parse_encoding (encoding));
 339 }
 340
 341 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
 342    at DATA.  DATA should start with the first non-ASCII text character (as
 343    determined by encoding_guess_is_ascii_text()) found in the input.
 344
 345    The return value is:
 346
 347        0, if the encoding is definitely not UTF-8 (because the input contains
 348        byte sequences that are not valid in UTF-8).
 349
 350        1, if the encoding appears to be UTF-8 (because the input contains valid
 351        UTF-8 multibyte sequences).
 352
 353        -1, if the input contains only ASCII characters.  (This means that the
 354        input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
 355
 356    See encoding-guesser.h for intended use of this function.
 357
 358    N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 359    that starting with the first non-ASCII text character. */
 360 int
 361 encoding_guess_tail_is_utf8 (const void *data, size_t n)
 362 {
 363   /* If all the bytes are in the ASCII range, it's just ASCII. */
 364   if (encoding_guess_count_ascii (data, n) == n)
 365     return -1;
 366
 367   return (n < ENCODING_GUESS_MIN
 368           ? u8_check (data, n) == NULL
 369           : is_all_utf8_text (data, n));
 370 }
 371
 372 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 373    name in one of the forms described at the top of encoding-guesser.h, and the
 374    SIZE byts in DATA, which contains the entire contents of the file.  Returns
 375    the guessed encoding, which might be ENCODING itself or a suffix of it or a
 376    statically allocated string.
 377
 378    Encoding autodetection only takes place if ENCODING actually specifies
 379    autodetection.  See encoding-guesser.h for details. */
 380 const char *
 381 encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
 382 {
 383   const char *guess;
 384
 385   guess = encoding_guess_head_encoding (encoding, text, size);
 386   if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
 387     return encoding_guess_tail_encoding (encoding, text, size);
 388   else
 389     return guess;
 390 }