X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fencoding-guesser.c;h=9518bd6396bd7d05472251e3ea779021dd459084;hb=cfe65232ff4bf28987ff1c1d02c9ec385de3c4af;hp=7d10015e2ea2961febf0efb8edb47d66e30033a8;hpb=fe8dc2171009e90d2335f159d05f7e6660e24780;p=pspp diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index 7d10015e2e..9518bd6396 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011 Free Software Foundation, Inc. + Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,10 +36,10 @@ of information about encoding detection. */ -/* Parses and returns the fallback encoding from ENCODING, which must be in one - of the forms described at the top of encoding-guesser.h. The returned - string might be ENCODING itself or a suffix of it, or it might be a - statically allocated string. */ +/* Returns the encoding specified by ENCODING, which must be in one of the + forms described at the top of encoding-guesser.h. The returned string might + be ENCODING itself or a suffix of it, or it might be a statically allocated + string. */ const char * encoding_guess_parse_encoding (const char *encoding) { @@ -88,7 +88,6 @@ static uint32_t get_le32 (const uint8_t *data) { return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0]; - } static const char * @@ -184,6 +183,36 @@ is_all_utf8_text (const void *s_, size_t n) return true; } +static bool +is_utf8_bom (const uint8_t *data, size_t n) +{ + return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf; +} + +static bool +is_utf16le_bom (const uint8_t *data, size_t n) +{ + return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff; +} + +static bool +is_utf16be_bom (const uint8_t *data, size_t n) +{ + return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff; +} + +static bool +is_utf32le_bom (const uint8_t *data, size_t n) +{ + return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff; +} + +static bool +is_utf32be_bom (const uint8_t *data, size_t n) +{ + return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff; +} + /* Attempts to guess the encoding of a text file based on ENCODING, an encoding name in one of the forms described at the top of encoding-guesser.h, and DATA, which contains the first N bytes of the file. Returns the guessed @@ -217,8 +246,7 @@ encoding_guess_head_encoding (const char *encoding, if (n == 0) return fallback_encoding; - if ((n >= ENCODING_GUESS_MIN || n % 4 == 0) - && (get_be32 (data) == 0xfeff || get_le32 (data) == 0xfeff)) + if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n)) return "UTF-32"; if (n >= 4) @@ -230,11 +258,10 @@ encoding_guess_head_encoding (const char *encoding, return "UTF-EBCDIC"; } - if ((n >= ENCODING_GUESS_MIN || n % 2 == 0) - && (get_be16 (data) == 0xfeff || get_le16 (data) == 0xfeff)) + if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n)) return "UTF-16"; - if (n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) + if (is_utf8_bom (data, n)) return "UTF-8"; guess = guess_utf16 (data, n); @@ -246,13 +273,62 @@ encoding_guess_head_encoding (const char *encoding, if (is_utf32 (data, n, get_le32)) return "UTF-32LE"; - if (!is_encoding_ascii_compatible (fallback_encoding) - || !encoding_guess_tail_is_utf8 (data, n)) + /* We've tried all the "giveaways" that make the encoding obvious. That + rules out, incidentally, all the encodings with multibyte units + (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8 + from some ASCII-based fallback encoding. */ + + /* If the fallback encoding isn't ASCII compatible, give up. */ + if (!is_encoding_ascii_compatible (fallback_encoding)) return fallback_encoding; + /* If the data we have clearly is not UTF-8, give up. */ + if (!encoding_guess_tail_is_utf8 (data, n)) + { + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + return "ASCII"; } +static bool +is_encoding_utf16 (const char *encoding) +{ + return (!c_strcasecmp (encoding, "utf-16") + || !c_strcasecmp (encoding, "utf16")); +} + +static bool +is_encoding_utf32 (const char *encoding) +{ + return (!c_strcasecmp (encoding, "utf-32") + || !c_strcasecmp (encoding, "utf32")); +} + +/* If ENCODING is the name of an encoding that could begin with a byte-order + mark, and in fact the N bytes in DATA do begin with a byte-order mark, + returns the number of bytes in the byte-order mark. Otherwise, returns 0. + + N must be at least ENCODING_GUESS_MIN, unless the file is shorter than + that. */ +size_t +encoding_guess_bom_length (const char *encoding, + const void *data_, size_t n) +{ + const uint8_t *data = data_; + + return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3 + : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2 + : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2 + : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4 + : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4 + : 0); +} + /* Returns an encoding guess based on ENCODING and the N bytes of text starting at DATA. DATA should start with the first non-ASCII text character (as determined by encoding_guess_is_ascii_text()) found in the input. @@ -268,16 +344,49 @@ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { - return (encoding_guess_tail_is_utf8 (data, n) - ? "UTF-8" - : encoding_guess_parse_encoding (encoding)); + + if (encoding_guess_tail_is_utf8 (data, n) != 0) + return "UTF-8"; + else + { + /* The data is not UTF-8. */ + const char *fallback_encoding = encoding_guess_parse_encoding (encoding); + + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + } -/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false - for the fallback encoding. */ -bool +/* Returns an encoding guess based on ENCODING and the N bytes of text starting + at DATA. DATA should start with the first non-ASCII text character (as + determined by encoding_guess_is_ascii_text()) found in the input. + + The return value is: + + 0, if the encoding is definitely not UTF-8 (because the input contains + byte sequences that are not valid in UTF-8). + + 1, if the encoding appears to be UTF-8 (because the input contains valid + UTF-8 multibyte sequences). + + -1, if the input contains only ASCII characters. (This means that the + input may be treated as UTF-8, since ASCII is a subset of UTF-8.) + + See encoding-guesser.h for intended use of this function. + + N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than + that starting with the first non-ASCII text character. */ +int encoding_guess_tail_is_utf8 (const void *data, size_t n) { + /* If all the bytes are in the ASCII range, it's just ASCII. */ + if (encoding_guess_count_ascii (data, n) == n) + return -1; + return (n < ENCODING_GUESS_MIN ? u8_check (data, n) == NULL : is_all_utf8_text (data, n)); @@ -298,15 +407,7 @@ encoding_guess_whole_file (const char *encoding, const void *text, size_t size) guess = encoding_guess_head_encoding (encoding, text, size); if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding)) - { - size_t ofs = encoding_guess_count_ascii (text, size); - if (ofs < size) - return encoding_guess_tail_encoding (encoding, - (const char *) text + ofs, - size - ofs); - else - return encoding_guess_parse_encoding (encoding); - } + return encoding_guess_tail_encoding (encoding, text, size); else return guess; }