From: Ben Pfaff Date: Thu, 12 May 2011 05:24:59 +0000 (-0700) Subject: encoding-guesser: New function encoding_guess_whole_file(). X-Git-Tag: v0.7.9~294 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2a1131829704f7ceec7fa856f34e9d2001961a43;p=pspp-builds.git encoding-guesser: New function encoding_guess_whole_file(). This will be used for the first time in an upcoming commit. --- diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index 298861e0..7d10015e 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -283,3 +283,30 @@ encoding_guess_tail_is_utf8 (const void *data, size_t n) : is_all_utf8_text (data, n)); } +/* Attempts to guess the encoding of a text file based on ENCODING, an encoding + name in one of the forms described at the top of encoding-guesser.h, and the + SIZE bytes in TEXT, which contains the entire contents of the file. Returns + the guessed encoding, which might be ENCODING itself or a suffix of it or a + statically allocated string. + + Encoding autodetection only takes place if ENCODING actually specifies + autodetection. See encoding-guesser.h for details. 
*/ +const char * +encoding_guess_whole_file (const char *encoding, const void *text, size_t size) +{ + const char *guess; + + guess = encoding_guess_head_encoding (encoding, text, size); /* Initial guess comes from encoding_guess_head_encoding() over the whole buffer. */ + if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding)) /* Refine only an "ASCII" guess, and only when encoding_guess_encoding_is_auto() accepts ENCODING. */ + { + size_t ofs = encoding_guess_count_ascii (text, size); /* NOTE(review): presumably the length of the leading ASCII run in TEXT; confirm against encoding_guess_count_ascii(). */ + if (ofs < size) /* Bytes remain past OFS: guess from that tail only. */ + return encoding_guess_tail_encoding (encoding, + (const char *) text + ofs, + size - ofs); + else /* OFS reached SIZE: return whatever encoding_guess_parse_encoding() derives from ENCODING. */ + return encoding_guess_parse_encoding (encoding); + } + else /* Any other case: the head guess is returned unchanged. */ + return guess; +} diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h index 2ec2fee2..0a7d1f99 100644 --- a/src/libpspp/encoding-guesser.h +++ b/src/libpspp/encoding-guesser.h @@ -115,6 +115,10 @@ bool encoding_guess_tail_is_utf8 (const void *, size_t); const char *encoding_guess_tail_encoding (const char *encoding, const void *, size_t); +/* Guessing from entire file contents. */ +const char *encoding_guess_whole_file (const char *encoding, + const void *, size_t); + /* Returns true if C is a byte that might appear in an ASCII text file, false otherwise. */ static inline bool