encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right.

[pspp] / src / libpspp / i18n.c
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c

index 0e461db160dc5f69fcc65fcba1284192e1c44758..c04dd5acaf9f6470e03b65e8544b8f7948f385cb 100644 (file)
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -47,14 +47,15 @@ struct converter
      char *tocode;
      char *fromcode;
      iconv_t conv;
+    int error;
    };
  
  static char *default_encoding;
  static struct hmapx map;
  
  /* A wrapper around iconv_open */
-static iconv_t
-create_iconv (const char* tocode, const char* fromcode)
+static struct converter *
+create_iconv__ (const char* tocode, const char* fromcode)
  {
    size_t hash;
    struct hmapx_node *node;
@@ -65,23 +66,34 @@ create_iconv (const char* tocode, const char* fromcode)
    HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map)
      if (!strcmp (tocode, converter->tocode)
          && !strcmp (fromcode, converter->fromcode))
-      return converter->conv;
+      return converter;
  
    converter = xmalloc (sizeof *converter);
    converter->tocode = xstrdup (tocode);
    converter->fromcode = xstrdup (fromcode);
    converter->conv = iconv_open (tocode, fromcode);
+  converter->error = converter->conv == (iconv_t) -1 ? errno : 0;
    hmapx_insert (&map, converter, hash);
  
+  return converter;
+}
+
+static iconv_t
+create_iconv (const char* tocode, const char* fromcode)
+{
+  struct converter *converter;
+
+  converter = create_iconv__ (tocode, fromcode);
+
    /* I don't think it's safe to translate this string or to use messaging
       as the converters have not yet been set up */
-  if ( (iconv_t) -1 == converter->conv && 0 != strcmp (tocode, fromcode))
+  if (converter->error && strcmp (tocode, fromcode))
      {
-      const int err = errno;
        fprintf (stderr,
                 "Warning: "
                 "cannot create a converter for `%s' to `%s': %s\n",
-               fromcode, tocode, strerror (err));
+               fromcode, tocode, strerror (converter->error));
+      converter->error = 0;
      }
  
    return converter->conv;
@@ -283,7 +295,7 @@ utf8_encoding_concat__ (const char *head, size_t head_len,
        else
          {
            size_t copy_len;
-          size_t prev;
+          ucs4_t prev;
            size_t ofs;
            int mblen;
  
@@ -324,7 +336,7 @@ utf8_encoding_concat__ (const char *head, size_t head_len,
          {
            bool correct_result = false;
            size_t copy_len;
-          size_t prev;
+          ucs4_t prev;
            size_t ofs;
            int mblen;
  
@@ -609,7 +621,8 @@ i18n_done (void)
      {
        free (cvtr->tocode);
        free (cvtr->fromcode);
-      iconv_close (cvtr->conv);
+      if (cvtr->conv != (iconv_t) -1)
+        iconv_close (cvtr->conv);
        free (cvtr);
      }
  
@@ -681,34 +694,52 @@ get_encoding_info (struct encoding_info *e, const char *name)
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
      "abcdefghijklmnopqrstuvwxyz{|}~");
  
-  struct substring out, cr, lf;
+  struct substring out, cr, lf, space;
    bool ok;
  
    memset (e, 0, sizeof *e);
  
    cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
    lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
-  ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
+  space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
+  ok = (cr.length >= 1
+        && cr.length <= MAX_UNIT
+        && cr.length == lf.length
+        && cr.length == space.length);
    if (!ok)
      {
        fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
        ss_dealloc (&cr);
        ss_dealloc (&lf);
+      ss_dealloc (&space);
        ss_alloc_substring (&cr, ss_cstr ("\r"));
        ss_alloc_substring (&lf, ss_cstr ("\n"));
+      ss_alloc_substring (&space, ss_cstr (" "));
      }
  
    e->unit = cr.length;
    memcpy (e->cr, cr.string, e->unit);
    memcpy (e->lf, lf.string, e->unit);
+  memcpy (e->space, space.string, e->unit);
  
    ss_dealloc (&cr);
    ss_dealloc (&lf);
+  ss_dealloc (&space);
  
    out = recode_substring_pool ("UTF-8", name, in, NULL);
    e->is_ascii_compatible = ss_equals (in, out);
    ss_dealloc (&out);
  
+  if (!e->is_ascii_compatible && e->unit == 1)
+    {
+      out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL);
+      e->is_ebcdic_compatible = (out.length == 1
+                                 && (uint8_t) out.string[0] == 0xc1);
+      ss_dealloc (&out);
+    }
+  else
+    e->is_ebcdic_compatible = false;
+
    return ok;
  }
  
@@ -720,3 +751,35 @@ is_encoding_ascii_compatible (const char *encoding)
    get_encoding_info (&e, encoding);
    return e.is_ascii_compatible;
  }
+
+bool
+is_encoding_ebcdic_compatible (const char *encoding)
+{
+  struct encoding_info e;
+  /* Flag requires: not ASCII compatible, 1-byte units, "A" encodes as 0xc1. */
+  get_encoding_info (&e, encoding);
+  return e.is_ebcdic_compatible;
+}
+
+/* Returns true if create_iconv__() obtains a valid iconv descriptor both for
+   ENCODING to UTF-8 and for UTF-8 to ENCODING, otherwise false. */
+bool
+is_encoding_supported (const char *encoding)
+{
+  return (create_iconv__ ("UTF-8", encoding)->conv != (iconv_t) -1
+          && create_iconv__ (encoding, "UTF-8")->conv != (iconv_t) -1);
+}
+
+/* Returns true if E spells "UTF8" or "UTF-8" in any letter case, else false.
+
+   XXX Possibly we should test not E as a string but its properties via
+   iconv. */
+bool
+is_encoding_utf8 (const char *e)
+{
+  return ((e[0] == 'u' || e[0] == 'U')
+          && (e[1] == 't' || e[1] == 'T')
+          && (e[2] == 'f' || e[2] == 'F')
+          && ((e[3] == '8' && e[4] == '\0')
+              || (e[3] == '-' && e[4] == '8' && e[5] == '\0')));
+}