src/libpspp/encoding-guesser.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #ifndef LIBPSPP_ENCODING_GUESSER_H
  18 #define LIBPSPP_ENCODING_GUESSER_H 1
  19
  20 #include <stdbool.h>
  21 #include <stddef.h>
  22 #include <stdint.h>
  23
  24 /* A library for autodetecting the encoding of a text file.
  25
  26    Naming Encodings
  27    ----------------
  28
  29    The encoding guesser starts with an encoding name in one of various
  30    different forms.  Some of the forms do not actually do any autodetection.
  31    The encoding guesser will return the specified encoding without looking at
  32    any file data:
  33
  34      - A valid IANA or system encoding name: These are returned as-is.
  35
  36      - "Locale": Translated to the encoding used by the system locale, as
  37        returned by locale_charset().
  38
  39    The remaining forms that do perform autodetection are:
  40
  41      - "Auto," followed by a valid IANA or system encoding name (the "fallback
  42        encoding"): Requests detection of whether the input is encoded in UTF-8,
  43        UTF-16, UTF-32, or a few other easily identifiable charsets.  When a
  44        particular character set cannot be recognized, the guesser falls back to
  45        the encoding following the comma.  UTF-8 detection works only for
  46        ASCII-compatible character sets.
  47
  48      - NULL or "Auto": As above, with the encoding used by the system locale as
  49        the fallback encoding.
  50
  51    The above are suggested capitalizations but encoding names are not
  52    case-sensitive.
  53
  54    The encoding_guess_parse_encoding() and encoding_guess_encoding_is_auto()
  55    functions work with encoding names in these forms.
  56
  57    Usage
  58    -----
  59
  60    1. Call encoding_guess_head_encoding() with several bytes from the start of
  61       the text file.  Feed in at least ENCODING_GUESS_MIN bytes, unless the
  62       file is shorter than that, but as many more as are conveniently
  63       available.  ENCODING_GUESS_SUGGESTED is a reasonable amount.
  64
  65       encoding_guess_head_encoding() returns its best guess at the file's
  66       encoding.  Ordinarily it returns a final guess that the client can use to
  67       interpret the file, and you're all done.  However, if it returns "ASCII"
  68       and the original encoding name requests autodetection (which you can find
  69       out by calling encoding_guess_encoding_is_auto()), then proceed to the
  70       next step.
  71
  72    2. The encoding guesser is confident that the stream uses an ASCII
  73       compatible encoding, either UTF-8 or the fallback encoding.  The client
  74       may safely read and process the stream up to the first non-ASCII
  75       character.  If the stream continues to be ASCII all the way to its end,
  76       then we're done.
  77
  78       The encoding guesser provides a pair of functions to detect non-ASCII
  79       characters: encoding_guess_is_ascii_text() for single characters and
  80       encoding_guess_count_ascii() as a convenient wrapper for whole buffers.
  81
  82    3. Otherwise, the stream contains some non-ASCII data at some point.  Now
  83       the client should gather several bytes starting at this point, at least
  84       ENCODING_GUESS_MIN, unless the file ends before that, but as many more as
  85       are conveniently available.  ENCODING_GUESS_SUGGESTED is a reasonable
  86       amount.
  87
  88       The client should pass these bytes to encoding_guess_tail_encoding(),
  89       which returns a best and final guess at the file's encoding, which is
  90       either UTF-8 or the fallback encoding.  Another alternative is
  91       encoding_guess_tail_is_utf8(), which guesses the same way but has a
  92       different form of return value.
  93 */
  94
  95 /* Minimum number of bytes for use in autodetection.
  96    You should only pass fewer bytes to the autodetection routines if the file
  97    is actually shorter than this. */
  98 #define ENCODING_GUESS_MIN              16
  99
 100 /* Suggested minimum buffer size to use for autodetection. */
 101 #define ENCODING_GUESS_SUGGESTED        1024
 102
 103 /* Parsing encoding names. */
 104 const char *encoding_guess_parse_encoding (const char *encoding);
 105 bool encoding_guess_encoding_is_auto (const char *encoding);
 106
 107 /* Making an initial coding guess based on the start of a file. */
 108 const char *encoding_guess_head_encoding (const char *encoding,
 109                                           const void *, size_t);
 110
 111 /* Refining an initial ASCII coding guess using later non-ASCII bytes. */
 112 static inline bool encoding_guess_is_ascii_text (uint8_t c);
 113 size_t encoding_guess_count_ascii (const void *, size_t);
 114 bool encoding_guess_tail_is_utf8 (const void *, size_t);
 115 const char *encoding_guess_tail_encoding (const char *encoding,
 116                                           const void *, size_t);
 117
 118 /* Returns true if C is a byte that might appear in an ASCII text file,
 119    false otherwise. */
 120 static inline bool
 121 encoding_guess_is_ascii_text (uint8_t c)
 122 {
 123   return (c >= 0x20 && c < 0x7f) || (c >= 0x09 && c < 0x0e);
 124 }
 125
 126 #endif /* libpspp/encoding-guesser.h */