X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fencoding-guesser.c;h=2f3438ff811ad954359261686871cd061db7bc48;hb=588d9107cd4b6eee3a0c3ece3cf53868e22c52f4;hp=b55f24a1de442db23f2cc603f60d7333853bcaab;hpb=0b0ca44889e637251cb5f2dbf3c7fdc4ec8b9bd7;p=pspp diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index b55f24a1de..2f3438ff81 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -43,19 +43,15 @@ const char * encoding_guess_parse_encoding (const char *encoding) { - const char *fallback; - if (encoding == NULL || !c_strcasecmp (encoding, "auto") || !c_strcasecmp (encoding, "auto,locale") || !c_strcasecmp (encoding, "locale")) - fallback = locale_charset (); + return locale_charset (); else if (!c_strncasecmp (encoding, "auto,", 5)) - fallback = encoding + 5; + return encoding + 5; else return encoding; - - return is_encoding_utf8 (fallback) ? "windows-1252" : fallback; } /* Returns true if ENCODING, which must be in one of the forms described at the @@ -193,28 +189,34 @@ is_utf8_bom (const uint8_t *data, size_t n) return n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf; } +static bool +is_bom_length (size_t n, size_t w) +{ + return n >= ENCODING_GUESS_MIN || (n && n % w == 0); +} + static bool is_utf16le_bom (const uint8_t *data, size_t n) { - return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff; + return is_bom_length (n, 2) && get_le16 (data) == 0xfeff; } static bool is_utf16be_bom (const uint8_t *data, size_t n) { - return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff; + return is_bom_length (n, 2) && get_be16 (data) == 0xfeff; } static bool is_utf32le_bom (const uint8_t *data, size_t n) { - return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff; + return is_bom_length (n, 4) && get_le32 (data) == 0xfeff; } static bool is_utf32be_bom (const uint8_t *data, size_t n) { - return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff; + return is_bom_length (n, 4) && get_be32 (data) == 0xfeff; } /* Attempts to guess the encoding of a text file based on ENCODING, an encoding @@ -277,10 +279,25 @@ encoding_guess_head_encoding (const char *encoding, if (is_utf32 (data, n, get_le32)) return "UTF-32LE"; - if (!is_encoding_ascii_compatible (fallback_encoding) - || !encoding_guess_tail_is_utf8 (data, n)) + /* We've tried all the "giveaways" that make the encoding obvious. That + rules out, incidentally, all the encodings with multibyte units + (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8 + from some ASCII-based fallback encoding. */ + + /* If the fallback encoding isn't ASCII compatible, give up. */ + if (!is_encoding_ascii_compatible (fallback_encoding)) return fallback_encoding; + /* If the data we have clearly is not UTF-8, give up. */ + if (!encoding_guess_tail_is_utf8 (data, n)) + { + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + return "ASCII"; } @@ -333,9 +350,21 @@ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { - return (encoding_guess_tail_is_utf8 (data, n) != 0 - ? "UTF-8" - : encoding_guess_parse_encoding (encoding)); + + if (encoding_guess_tail_is_utf8 (data, n) != 0) + return "UTF-8"; + else + { + /* The data is not UTF-8. */ + const char *fallback_encoding = encoding_guess_parse_encoding (encoding); + + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + } /* Returns an encoding guess based on ENCODING and the N bytes of text starting