X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flibpspp%2Fencoding-guesser.c;h=bee29782f2a5e582d4cc7c463c4b8d269de17799;hb=ddf35b11f8525281dd9e524a5d322a80bf56a24f;hp=9042e93a2fd41db02012d19f206c378748adc7c8;hpb=f3668539947d5baed813a4f8436d6cf36abeedd2;p=pspp diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index 9042e93a2f..bee29782f2 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011 Free Software Foundation, Inc. + Copyright (C) 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,22 +36,26 @@ of information about encoding detection. */ -/* Parses and returns the fallback encoding from ENCODING, which must be in one - of the forms described at the top of encoding-guesser.h. The returned - string might be ENCODING itself or a suffix of it, or it might be a - statically allocated string. */ +/* Returns the encoding specified by ENCODING, which must be in one of the + forms described at the top of encoding-guesser.h. The returned string might + be ENCODING itself or a suffix of it, or it might be a statically allocated + string. */ const char * encoding_guess_parse_encoding (const char *encoding) { + const char *fallback; + if (encoding == NULL || !c_strcasecmp (encoding, "auto") || !c_strcasecmp (encoding, "auto,locale") || !c_strcasecmp (encoding, "locale")) - return locale_charset (); + fallback = locale_charset (); else if (!c_strncasecmp (encoding, "auto,", 5)) - return encoding + 5; + fallback = encoding + 5; else return encoding; + + return is_encoding_utf8 (fallback) ? 
"windows-1252" : fallback; } /* Returns true if ENCODING, which must be in one of the forms described at the @@ -88,7 +92,6 @@ static uint32_t get_le32 (const uint8_t *data) { return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0]; - } static const char * @@ -250,10 +253,6 @@ encoding_guess_head_encoding (const char *encoding, || !encoding_guess_tail_is_utf8 (data, n)) return fallback_encoding; - if (!c_strcasecmp (fallback_encoding, "UTF-8") - || !c_strcasecmp (fallback_encoding, "UTF8")) - return "UTF-8"; - return "ASCII"; } @@ -272,18 +271,58 @@ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { - return (encoding_guess_tail_is_utf8 (data, n) + return (encoding_guess_tail_is_utf8 (data, n) != 0 ? "UTF-8" : encoding_guess_parse_encoding (encoding)); } -/* Same as encoding_guess_tail_encoding() but returns true for UTF-8 or false - for the fallback encoding. */ -bool +/* Returns an encoding guess based on ENCODING and the N bytes of text starting + at DATA. DATA should start with the first non-ASCII text character (as + determined by encoding_guess_is_ascii_text()) found in the input. + + The return value is: + + 0, if the encoding is definitely not UTF-8 (because the input contains + byte sequences that are not valid in UTF-8). + + 1, if the encoding appears to be UTF-8 (because the input contains valid + UTF-8 multibyte sequences). + + -1, if the input contains only ASCII characters. (This means that the + input may be treated as UTF-8, since ASCII is a subset of UTF-8.) + + See encoding-guesser.h for intended use of this function. + + N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than + that starting with the first non-ASCII text character. */ +int encoding_guess_tail_is_utf8 (const void *data, size_t n) { + /* If all the bytes are in the ASCII range, it's just ASCII. */ + if (encoding_guess_count_ascii (data, n) == n) + return -1; + return (n < ENCODING_GUESS_MIN ? 
u8_check (data, n) == NULL : is_all_utf8_text (data, n)); } +/* Attempts to guess the encoding of a text file based on ENCODING, an encoding + name in one of the forms described at the top of encoding-guesser.h, and the + SIZE bytes in TEXT, which contains the entire contents of the file. Returns + the guessed encoding, which might be ENCODING itself or a suffix of it or a + statically allocated string. + + Encoding autodetection only takes place if ENCODING actually specifies + autodetection. See encoding-guesser.h for details. */ +const char * +encoding_guess_whole_file (const char *encoding, const void *text, size_t size) +{ + const char *guess; + + guess = encoding_guess_head_encoding (encoding, text, size); + if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding)) + return encoding_guess_tail_encoding (encoding, text, size); + else + return guess; +}