1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/encoding-guesser.h"
29 #include "libpspp/cast.h"
30 #include "libpspp/i18n.h"
32 #include "gl/localcharset.h"
33 #include "gl/c-strcase.h"
35 /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
36 of information about encoding detection.
39 /* Returns the encoding specified by ENCODING, which must be in one of the
40 forms described at the top of encoding-guesser.h. The returned string might
41 be ENCODING itself or a suffix of it, or it might be a statically allocated
/* Fallback selection, as far as the lines below show:

     - "auto", "auto,locale" and "locale" (compared without regard to case)
       select locale_charset ().

     - Any other name that starts with "auto," selects the text after the
       comma.

   NOTE(review): the opening of the condition and the final branch are not
   visible in this excerpt -- confirm them against the full function. */
44 encoding_guess_parse_encoding (const char *encoding)
49 || !c_strcasecmp (encoding, "auto")
50 || !c_strcasecmp (encoding, "auto,locale")
51 || !c_strcasecmp (encoding, "locale"))
52 fallback = locale_charset ();
53 else if (!c_strncasecmp (encoding, "auto,", 5))
/* Skip the five characters of "auto,"; the result points into ENCODING
   itself rather than at a copy. */
54 fallback = encoding + 5;
/* A fallback that is_encoding_utf8 () accepts is never returned as such;
   "windows-1252" is substituted for it. */
58 return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
61 /* Returns true if ENCODING, which must be in one of the forms described at the
62 top of encoding-guesser.h, is one that performs encoding autodetection,
65 encoding_guess_encoding_is_auto (const char *encoding)
/* A null ENCODING counts as autodetection.  Otherwise the name must begin
   with "auto" (any case) and continue with either a comma or the end of the
   string, so that a name such as "autoX" does not match.  Reading
   encoding[4] is safe because it is only evaluated after the first four
   characters matched "auto". */
67 return (encoding == NULL
68 || (!c_strncasecmp (encoding, "auto", 4)
69 && (encoding[4] == ',' || encoding[4] == '\0')));
/* Combines the two bytes at DATA into one value, with DATA[0] as the
   high-order byte (big-endian order).  Reads exactly DATA[0] and DATA[1]. */
73 get_be16 (const uint8_t *data)
75 return (data[0] << 8) | data[1];
/* Combines the two bytes at DATA into one value, with DATA[1] as the
   high-order byte (little-endian order).  Reads exactly DATA[0] and
   DATA[1]. */
79 get_le16 (const uint8_t *data)
81 return (data[1] << 8) | data[0];
/* Combines the four bytes at DATA into one value, with DATA[0] as the most
   significant byte (big-endian order).

   NOTE(review): each byte is promoted to int before shifting.  Where int is
   32 bits wide, a DATA[0] of 0x80 or more makes `data[0] << 24' shift into
   the sign bit, which ISO C leaves undefined.  Converting each byte to
   uint32_t before the shift would avoid that. */
85 get_be32 (const uint8_t *data)
87 return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
/* Combines the four bytes at DATA into one value, with DATA[3] as the most
   significant byte (little-endian order).

   NOTE(review): each byte is promoted to int before shifting.  Where int is
   32 bits wide, a DATA[3] of 0x80 or more makes `data[3] << 24' shift into
   the sign bit, which ISO C leaves undefined.  Converting each byte to
   uint32_t before the shift would avoid that. */
92 get_le32 (const uint8_t *data)
94 return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
/* Heuristic that tallies zero bytes in the first and second position of
   two-byte units of DATA and compares the tallies.

   NOTE(review): the loop header and every return statement are missing from
   this excerpt, so the comments below describe only the lines shown. */
98 guess_utf16 (const uint8_t *data, size_t n)
100 size_t even_nulls, odd_nulls;
/* Special case for input that is both shorter than ENCODING_GUESS_MIN and of
   odd length. */
102 if (n < ENCODING_GUESS_MIN && n % 2 != 0)
105 even_nulls = odd_nulls = 0;
/* Each comparison yields 0 or 1, so these lines count zero bytes seen as the
   first and as the second byte of a unit. */
108 even_nulls += data[0] == 0;
109 odd_nulls += data[1] == 0;
/* A unit whose two bytes are both zero gets separate treatment. */
110 if (data[0] == 0 && data[1] == 0)
/* More zeros in the second byte than in the first.  NOTE(review): presumably
   this indicates little-endian text and the next branch big-endian --
   confirm against the hidden return statements. */
117 if (odd_nulls > even_nulls)
119 else if (even_nulls > 0)
/* Examines DATA as 32-bit units decoded by the caller-supplied reader
   GET_U32 (the visible callers pass get_be32 and get_le32).

   NOTE(review): the loop header and the return statements are missing from
   this excerpt. */
126 is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
/* Special case for input that is both shorter than ENCODING_GUESS_MIN and
   not a multiple of four bytes long. */
128 if (n < ENCODING_GUESS_MIN && n % 4 != 0)
133 uint32_t uc = get_u32 (data);
/* Singles out values below 0x09 and values above 0x10ffff, the highest
   Unicode code point. */
135 if (uc < 0x09 || uc > 0x10ffff)
145 /* Counts and returns the number of bytes, but no more than N, starting at S
146 that are ASCII text characters. */
148 encoding_guess_count_ascii (const void *s_, size_t n)
/* View the untyped buffer as unsigned bytes. */
150 const uint8_t *s = s_;
/* Walk the buffer from the start, never past N bytes; the test below picks
   out the first byte that encoding_guess_is_ascii_text () rejects. */
153 for (ofs = 0; ofs < n; ofs++)
154 if (!encoding_guess_is_ascii_text (s[ofs]))
/* NOTE(review): only four lines of this function are visible.  Judging by
   its name and the calls below, it scans the N bytes at S_ and decides
   whether they form UTF-8 text -- confirm against the full function. */
160 is_all_utf8_text (const void *s_, size_t n)
162 const uint8_t *s = s_;
/* Bytes rejected by encoding_guess_is_ascii_text () get separate
   handling. */
171 if (!encoding_guess_is_ascii_text (c))
/* Decode one character from the N - OFS bytes that remain at S + OFS; the
   character goes to UC and u8_mbtoucr ()'s result to MBLEN. */
180 mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
/* True when DATA holds at least three bytes and they are EF BB BF, the UTF-8
   encoding of U+FEFF.  The length test comes first, so no byte is read from
   a shorter buffer. */
191 is_utf8_bom (const uint8_t *data, size_t n)
193 return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
/* True when the first two bytes of DATA, read in little-endian order, equal
   0xfeff, provided N is at least ENCODING_GUESS_MIN or is even.

   NOTE(review): N == 0 satisfies `n % 2 == 0', so get_le16 () would read two
   bytes from an empty buffer unless every caller rules that case out --
   confirm. */
197 is_utf16le_bom (const uint8_t *data, size_t n)
199 return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff;
/* True when the first two bytes of DATA, read in big-endian order, equal
   0xfeff, provided N is at least ENCODING_GUESS_MIN or is even.

   NOTE(review): N == 0 satisfies `n % 2 == 0', so get_be16 () would read two
   bytes from an empty buffer unless every caller rules that case out --
   confirm. */
203 is_utf16be_bom (const uint8_t *data, size_t n)
205 return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff;
/* True when the first four bytes of DATA, read in little-endian order, equal
   0xfeff, provided N is at least ENCODING_GUESS_MIN or is a multiple of 4.

   NOTE(review): N == 0 satisfies `n % 4 == 0', so get_le32 () would read
   four bytes from an empty buffer unless every caller rules that case out --
   confirm. */
209 is_utf32le_bom (const uint8_t *data, size_t n)
211 return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff;
/* True when the first four bytes of DATA, read in big-endian order, equal
   0xfeff, provided N is at least ENCODING_GUESS_MIN or is a multiple of 4.

   NOTE(review): N == 0 satisfies `n % 4 == 0', so get_be32 () would read
   four bytes from an empty buffer unless every caller rules that case out --
   confirm. */
215 is_utf32be_bom (const uint8_t *data, size_t n)
217 return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff;
220 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
221 name in one of the forms described at the top of encoding-guesser.h, and
222 DATA, which contains the first N bytes of the file. Returns the guessed
223 encoding, which might be ENCODING itself or a suffix of it or a statically
226 Encoding autodetection only takes place if ENCODING actually specifies
227 autodetection. See encoding-guesser.h for details.
229 UTF-8 cannot be distinguished from other ASCII-based encodings until a
230 non-ASCII text character is encountered. If ENCODING specifies
231 autodetection and this function returns "ASCII", then the client should
232 process the input until it encounters a non-ASCII character (as returned by
233 encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
234 to make a final encoding guess. See encoding-guesser.h for details.
236 N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
239 encoding_guess_head_encoding (const char *encoding,
240 const void *data_, size_t n)
242 const uint8_t *data = data_;
243 const char *fallback_encoding;
/* The fallback is computed up front.  An ENCODING that does not request
   autodetection is answered with it straight away, without looking at
   DATA. */
246 fallback_encoding = encoding_guess_parse_encoding (encoding);
247 if (!encoding_guess_encoding_is_auto (encoding))
248 return fallback_encoding;
/* NOTE(review): the condition guarding this second early return is not
   visible in this excerpt. */
251 return fallback_encoding;
/* Byte-order marks are tried first, UTF-32 ahead of UTF-16: a little-endian
   UTF-32 mark (FF FE 00 00) begins with the little-endian UTF-16 mark
   (FF FE), so the wider test has to win. */
253 if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n))
/* Other four-byte signatures are matched on the big-endian reading of the
   first four bytes.  NOTE(review): 0xdd736673 looks like the UTF-EBCDIC
   signature DD 73 66 73 -- confirm against the hidden return. */
258 uint32_t x = get_be32 (data);
261 else if (x == 0xdd736673)
265 if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n))
268 if (is_utf8_bom (data, n))
/* No mark matched: fall through to the content heuristics, first for UTF-16
   and then for UTF-32 in each byte order. */
271 guess = guess_utf16 (data, n);
275 if (is_utf32 (data, n, get_be32))
277 if (is_utf32 (data, n, get_le32))
/* encoding_guess_tail_is_utf8 () returns 0 only for input that is definitely
   not UTF-8, so the fallback is returned here when it is not
   ASCII-compatible or when DATA cannot be UTF-8.  Results of 1 and -1 both
   continue past this point. */
280 if (!is_encoding_ascii_compatible (fallback_encoding)
281 || !encoding_guess_tail_is_utf8 (data, n))
282 return fallback_encoding;
/* True when ENCODING is exactly "utf-16" or "utf16", ignoring case.  The
   whole string is compared, so names with a byte-order suffix such as
   "UTF-16LE" do not match. */
288 is_encoding_utf16 (const char *encoding)
290 return (!c_strcasecmp (encoding, "utf-16")
291 || !c_strcasecmp (encoding, "utf16"));
/* True when ENCODING is exactly "utf-32" or "utf32", ignoring case.  The
   whole string is compared, so names with a byte-order suffix such as
   "UTF-32BE" do not match. */
295 is_encoding_utf32 (const char *encoding)
297 return (!c_strcasecmp (encoding, "utf-32")
298 || !c_strcasecmp (encoding, "utf32"));
301 /* If ENCODING is the name of an encoding that could begin with a byte-order
302 mark, and in fact the N bytes in DATA do begin with a byte-order mark,
303 returns the number of bytes in the byte-order mark. Otherwise, returns 0.
305 N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
308 encoding_guess_bom_length (const char *encoding,
309 const void *data_, size_t n)
311 const uint8_t *data = data_;
/* Each arm pairs a byte-order-mark test on DATA with a name test on
   ENCODING and yields the size of that mark: 3 bytes for UTF-8, 2 for
   UTF-16, 4 for UTF-32.  A mark only counts when ENCODING names the same
   family.

   NOTE(review): in every arm the test on DATA runs before the test on
   ENCODING, and the UTF-16 and UTF-32 tests accept N == 0 on their parity
   check, so an empty buffer would be read regardless of ENCODING unless
   callers never pass N == 0 -- confirm.  The final arm of the chain is not
   visible in this excerpt. */
313 return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3
314 : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2
315 : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2
316 : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4
317 : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4
321 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
322 at DATA. DATA should start with the first non-ASCII text character (as
323 determined by encoding_guess_is_ascii_text()) found in the input.
325 The return value will either be "UTF-8" or the fallback encoding for
328 See encoding-guesser.h for intended use of this function.
330 N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
331 that starting with the first non-ASCII text character. */
333 encoding_guess_tail_encoding (const char *encoding,
334 const void *data, size_t n)
/* encoding_guess_tail_is_utf8 () returns 0 only when the input is definitely
   not UTF-8.  Both of its other results, 1 (valid multibyte UTF-8 seen) and
   -1 (ASCII only), therefore select the first arm, and only 0 selects the
   fallback parsed from ENCODING.

   NOTE(review): the first arm of the conditional is not visible in this
   excerpt. */
336 return (encoding_guess_tail_is_utf8 (data, n) != 0
338 : encoding_guess_parse_encoding (encoding));
341 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
342 at DATA. DATA should start with the first non-ASCII text character (as
343 determined by encoding_guess_is_ascii_text()) found in the input.
347 0, if the encoding is definitely not UTF-8 (because the input contains
348 byte sequences that are not valid in UTF-8).
350 1, if the encoding appears to be UTF-8 (because the input contains valid
351 UTF-8 multibyte sequences).
353 -1, if the input contains only ASCII characters. (This means that the
354 input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
356 See encoding-guesser.h for intended use of this function.
358 N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
359 that starting with the first non-ASCII text character. */
361 encoding_guess_tail_is_utf8 (const void *data, size_t n)
363 /* If all the bytes are in the ASCII range, it's just ASCII. */
364 if (encoding_guess_count_ascii (data, n) == n)
/* With fewer than ENCODING_GUESS_MIN bytes the buffer is validated strictly
   by u8_check (), which returns NULL for well-formed UTF-8.  Longer input
   goes through is_all_utf8_text () instead.  NOTE(review): presumably the
   latter tolerates a multibyte sequence cut off at the end of a partial
   buffer -- confirm against its full definition. */
367 return (n < ENCODING_GUESS_MIN
368 ? u8_check (data, n) == NULL
369 : is_all_utf8_text (data, n));
372 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
373 name in one of the forms described at the top of encoding-guesser.h, and the
374 SIZE bytes in TEXT, which contains the entire contents of the file. Returns
375 the guessed encoding, which might be ENCODING itself or a suffix of it or a
376 statically allocated string.
378 Encoding autodetection only takes place if ENCODING actually specifies
379 autodetection. See encoding-guesser.h for details. */
381 encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
/* Start with the head guess over the whole buffer. */
385 guess = encoding_guess_head_encoding (encoding, text, size);
/* Under autodetection a head guess of "ASCII" is only provisional, so it is
   refined by the tail guess over the same bytes.  strcmp () is
   case-sensitive: only the exact spelling "ASCII" triggers this. */
386 if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
387 return encoding_guess_tail_encoding (encoding, text, size);