pintos-os.org Git - pspp/blob - src/libpspp/encoding-guesser.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #ifndef LIBPSPP_ENCODING_GUESSER_H
  18 #define LIBPSPP_ENCODING_GUESSER_H 1
  19
  20 #include <stdbool.h>
  21 #include <stddef.h>
  22 #include <stdint.h>
  23
  24 /* A library for autodetecting the encoding of a text file.
  25
  26    Naming Encodings
  27    ----------------
  28
  29    The encoding guesser starts with an encoding name in one of various
  30    different forms.  Some of the forms do not actually do any autodetection.
  31    The encoding guesser will return the specified encoding without looking at
  32    any file data:
  33
  34      - A valid IANA or system encoding name: These are returned as-is.
  35
  36      - "Locale": Translated to the encoding used by the system locale, as
  37        returned by locale_charset().
  38
  39    The remaining forms that do perform autodetection are:
  40
  41      - "Auto," followed by a valid IANA or system encoding name (the "fallback
  42        encoding"): Requests detection of whether the input is encoded in UTF-8,
  43        UTF-16, UTF-32, or a few other easily identifiable charsets.  When a
  44        particular character set cannot be recognized, the guesser falls back to
  45        the encoding following the comma.  When the fallback encoding is UTF-8,
  46        but the input is invalid UTF-8, then the windows-1252 encoding (closely
  47        related to ISO 8859-1) is used instead.  UTF-8 detection works only for
  48        ASCII-compatible character sets.
  49
  50      - NULL or "Auto": As above, with the encoding used by the system locale as
  51        the fallback encoding.
  52
  53    The above are suggested capitalizations but encoding names are not
  54    case-sensitive.
  55
  56    The encoding_guess_parse_encoding() and encoding_guess_encoding_is_auto()
  57    functions work with encoding names in these forms.
  58
  59    Endian variants
  60    ---------------
  61
  62    These functions identify three different variants of UTF-16:
  63
  64      - "UTF-16BE": Big-endian UTF-16 byte order without a byte-order mark
  65        (BOM),
  66
  67      - "UTF-16LE": Little-endian UTF-16 byte order without a BOM,
  68
  69      - "UTF-16": Big-endian or little-endian UTF-16 byte order *with* a BOM,
  70
  71    and similarly for UTF-32.
  72
  73    Unicode requires these distinctions.  The UTF-8, UTF-16, UTF-32 & BOM FAQ at
  74    https://unicode.org/faq/utf_bom.html, for example, says:
  75
  76      Q: Why do some of the UTFs have a BE or LE in their label, such as
  77      UTF-16LE?
  78
  79      A: UTF-16 and UTF-32 use code units that are two and four bytes long
  80      respectively. For these UTFs, there are three sub-flavors: BE, LE and
  81      unmarked. The BE form uses big-endian byte serialization (most significant
  82      byte first), the LE form uses little-endian byte serialization (least
  83      significant byte first) and the unmarked form uses big-endian byte
  84      serialization by default, but may include a byte order mark at the
  85      beginning to indicate the actual byte serialization used.
  86
  87    ...
  88
  89      Q: How do I tag data that does not interpret U+FEFF as a BOM?
  90
  91      A: Use the tag UTF-16BE to indicate big-endian UTF-16 text, and UTF-16LE
  92      to indicate little-endian UTF-16 text. If you do use a BOM, tag the text
  93      as simply UTF-16. [MD]
  94
  95      Q: Why wouldn’t I always use a protocol that requires a BOM?
  96
  97      A: Where the data has an associated type, such as a field in a database, a
  98      BOM is unnecessary. In particular, if a text data stream is marked as
  99      UTF-16BE, UTF-16LE, UTF-32BE or UTF-32LE, a BOM is neither necessary nor
 100      permitted. Any U+FEFF would be interpreted as a ZWNBSP...
 101
 102    Usage
 103    -----
 104
 105    1. Call encoding_guess_head_encoding() with several bytes from the start of
 106       the text file.  Feed in at least ENCODING_GUESS_MIN bytes, unless the
 107       file is shorter than that, but as many more as are conveniently
 108       available.  ENCODING_GUESS_SUGGESTED is a reasonable amount.
 109
 110       encoding_guess_head_encoding() returns its best guess at the file's
 111       encoding.  Ordinarily it returns a final guess that the client can use to
 112       interpret the file, and you're all done.  However, if it returns "ASCII"
 113       and the original encoding name requests autodetection (which you can find
 114       out by calling encoding_guess_encoding_is_auto()), then proceed to the
 115       next step.
 116
 117    2. The encoding guesser is confident that the stream uses an
 118       ASCII-compatible encoding, either UTF-8 or the fallback encoding.  The client
 119       may safely read and process the stream up to the first non-ASCII
 120       character.  If the stream continues to be ASCII all the way to its end,
 121       then we're done.
 122
 123       The encoding guesser provides a pair of functions to detect non-ASCII
 124       characters: encoding_guess_is_ascii_text() for single characters and
 125       encoding_guess_count_ascii() as a convenient wrapper for whole buffers.
 126
 127    3. Otherwise, the stream contains some non-ASCII data at some point.  Now
 128       the client should gather several bytes starting at this point, at least
 129       ENCODING_GUESS_MIN, unless the file ends before that, but as many more as
 130       are conveniently available.  ENCODING_GUESS_SUGGESTED is a reasonable
 131       amount.
 132
 133       The client should pass these bytes to encoding_guess_tail_encoding(),
 134       which returns a best and final guess at the file's encoding, which is
 135       either UTF-8 or the fallback encoding.  Another alternative is
 136       encoding_guess_tail_is_utf8(), which guesses the same way but has a
 137       different form of return value.
 138 */
 139
 140 /* Minimum number of bytes for use in autodetection.
 141    You should only pass fewer bytes to the autodetection routines if the file
 142    is actually shorter than this. */
 143 #define ENCODING_GUESS_MIN              16
 144
 145 /* Suggested minimum buffer size to use for autodetection. */
 146 #define ENCODING_GUESS_SUGGESTED        1024
 147
 148 /* Parsing encoding names. */
 149 const char *encoding_guess_parse_encoding (const char *encoding);
 150 bool encoding_guess_encoding_is_auto (const char *encoding);
 151
 152 /* Making an initial coding guess based on the start of a file. */
 153 const char *encoding_guess_head_encoding (const char *encoding,
 154                                           const void *, size_t);
 155 size_t encoding_guess_bom_length (const char *encoding,
 156                                   const void *, size_t n);
 157
 158 /* Refining an initial ASCII coding guess using later non-ASCII bytes. */
 159 static inline bool encoding_guess_is_ascii_text (uint8_t c);
 160 size_t encoding_guess_count_ascii (const void *, size_t);
 161 int encoding_guess_tail_is_utf8 (const void *, size_t);
 162 const char *encoding_guess_tail_encoding (const char *encoding,
 163                                           const void *, size_t);
 164
 165 /* Guessing from entire file contents. */
 166 const char *encoding_guess_whole_file (const char *encoding,
 167                                        const void *, size_t);
 168
 169 /* Returns true if C is a byte that might appear in an ASCII text file,
 170    false otherwise. */
 171 static inline bool
 172 encoding_guess_is_ascii_text (uint8_t c)
 173 {
 174   return (c >= 0x20 && c < 0x7f) || (c >= 0x09 && c < 0x0e);
 175 }
 176
 177 #endif /* libpspp/encoding-guesser.h */