From 2a1131829704f7ceec7fa856f34e9d2001961a43 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Wed, 11 May 2011 22:24:59 -0700
Subject: [PATCH] encoding-guesser: New function encoding_guess_whole_file().

This will be used for the first time in an upcoming commit.
---
 src/libpspp/encoding-guesser.c | 27 +++++++++++++++++++++++++++
 src/libpspp/encoding-guesser.h |  4 ++++
 2 files changed, 31 insertions(+)

diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c
index 298861e0..7d10015e 100644
--- a/src/libpspp/encoding-guesser.c
+++ b/src/libpspp/encoding-guesser.c
@@ -283,3 +283,30 @@ encoding_guess_tail_is_utf8 (const void *data, size_t n)
           : is_all_utf8_text (data, n));
 }
 
+/* Guesses the encoding of a text file whose complete contents are the SIZE
+   bytes at TEXT.  ENCODING is an encoding name in one of the forms described
+   at the top of encoding-guesser.h.  Returns the guessed encoding, which
+   might be ENCODING itself or a suffix of it or a statically allocated
+   string.
+
+   Encoding autodetection only takes place if ENCODING actually specifies
+   autodetection.  See encoding-guesser.h for details. */
+const char *
+encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
+{
+  const char *head = encoding_guess_head_encoding (encoding, text, size);
+  size_t n_ascii;
+
+  /* Use the head guess unless it is "ASCII" and ENCODING is automatic. */
+  if (strcmp (head, "ASCII") != 0
+      || !encoding_guess_encoding_is_auto (encoding))
+    return head;
+
+  /* Guess from the bytes past the counted prefix, if there are any. */
+  n_ascii = encoding_guess_count_ascii (text, size);
+  if (n_ascii >= size)
+    return encoding_guess_parse_encoding (encoding);
+
+  return encoding_guess_tail_encoding (encoding, (const char *) text + n_ascii,
+                                       size - n_ascii);
+}
diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h
index 2ec2fee2..0a7d1f99 100644
--- a/src/libpspp/encoding-guesser.h
+++ b/src/libpspp/encoding-guesser.h
@@ -115,6 +115,10 @@ bool encoding_guess_tail_is_utf8 (const void *, size_t);
 const char *encoding_guess_tail_encoding (const char *encoding,
                                           const void *, size_t);
 
+/* Guessing from entire file contents. */
+const char *encoding_guess_whole_file (const char *encoding,
+                                       const void *, size_t);
+
 /* Returns true if C is a byte that might appear in an ASCII text file, false
    otherwise. */
 static inline bool
-- 
2.30.2