Make SYSFILE INFO able to analyze valid encodings for a system file.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)
diff --git a/NEWS b/NEWS

index 57de7877b2ad907e3d758c89c6451035698cb358..51dc42cd1c5d82a390429e3d59fe73d3bda49bc5 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -9,9 +9,6 @@ Changes since 0.8.2:
   * REGRESSION now recognises /STATISTICS=CI(x) which causes confidence
     intervals for the coefficients to be printed.
  
- * The SYSFILE INFO command now accepts an ENCODING subcommand to
-   specify the character encoding of string data in the system file.
-
   * PSPPIRE graphical user interface improvements:
  
     - File|Open now allows an encoding to be selected for opening
@@ -20,8 +17,17 @@ Changes since 0.8.2:
     - File|Display Data File Information|External File... now allows an
       encoding to be selected.
  
- * System files that contain duplicate variable names may now be read
-   successfully (bug #41475).
+ * System file related improvements:
+
+   - With ENCODING="DETECT", SYSFILE INFO can now help the user to
+     pick an encoding for reading a system file that does not identify
+     its own encoding.
+
+   - SYSFILE INFO now accepts an ENCODING subcommand to specify the
+     character encoding of string data in the system file.
+
+   - System files that contain duplicate variable names may now be
+     read successfully (bug #41475).
  
  Changes from 0.8.1 to 0.8.2:
  
diff --git a/doc/files.texi b/doc/files.texi

index 318c887265eb83df35ed85437ff4db5564b3603b..889af5d2a96cf055182f8cf1cec235a3605379a6 100644 (file)
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -179,8 +179,9 @@ is affected by these subcommands.
  file.  Sometimes, however, this does not work well,
  especially for files written by old versions of SPSS or @pspp{}.  Specify
  the @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name as its string
-argument to override the default.  The @subcmd{ENCODING} subcommand is a @pspp{}
-extension.
+argument to override the default.  Use @cmd{SYSFILE INFO} to analyze
+the encodings that might be valid for a system file.  The
+@subcmd{ENCODING} subcommand is a @pspp{} extension.
  
  @cmd{GET} does not cause the data to be read, only the dictionary.  The data
  is read later, when a procedure is executed.
@@ -923,8 +924,10 @@ a system file and displays information on its dictionary.
  the file.  Sometimes, however, this does not work well, especially for
  files written by old versions of SPSS or @pspp{}.  Specify the
  @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name
-as its string argument to override the default.  The @subcmd{ENCODING}
-subcommand is a @pspp{} extension.
+as its string argument to override the default, or specify
+@code{ENCODING='DETECT'} to analyze and report possibly valid
+encodings for the system file.  The @subcmd{ENCODING} subcommand is a
+@pspp{} extension.
  
  @cmd{SYSFILE INFO} does not affect the current active dataset.
  
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c

index 9fab76f89df109dae340c5166a3705352ce3fbcb..a93f75fa77ad08dda9b4adc332f6d8ff131224cb 100644 (file)
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -110,7 +110,7 @@ struct sfm_var_record
    {
      off_t pos;
      int width;
-    char name[8];
+    char name[9];
      int print_format;
      int write_format;
      int missing_value_code;
@@ -567,6 +567,156 @@ sfm_get_encoding (const struct sfm_reader *r)
    return NULL;
  }
  
+struct get_strings_aux          /* Accumulator filled in by add_string__(). */
+  {
+    struct pool *pool;          /* Pool used for the arrays and string copies. */
+    char **titles;              /* Title describing each collected string. */
+    char **strings;             /* Pool-allocated copy of each collected string. */
+    bool *ids;                  /* ID flag recorded for each collected string. */
+    size_t allocated;           /* Capacity of the three parallel arrays. */
+    size_t n;                   /* Number of entries currently in use. */
+  };
+
+/* Appends one entry to AUX: a pool-allocated copy of STRING, the flag ID, and
+   TITLE, which is stored as-is rather than copied.  The three parallel arrays
+   are enlarged first if they are already full. */
+static void
+add_string__ (struct get_strings_aux *aux,
+              const char *string, bool id, char *title)
+{
+  size_t slot;
+
+  if (aux->n >= aux->allocated)
+    {
+      size_t capacity = 2 * (aux->allocated + 1);
+
+      aux->titles = pool_realloc (aux->pool, aux->titles,
+                                  capacity * sizeof *aux->titles);
+      aux->strings = pool_realloc (aux->pool, aux->strings,
+                                   capacity * sizeof *aux->strings);
+      aux->ids = pool_realloc (aux->pool, aux->ids,
+                               capacity * sizeof *aux->ids);
+      aux->allocated = capacity;
+    }
+
+  slot = aux->n++;
+  aux->titles[slot] = title;
+  aux->strings[slot] = pool_strdup (aux->pool, string);
+  aux->ids[slot] = id;
+}
+
+/* Records STRING in AUX as free-form text (its ID flag is false).  TITLE is a
+   printf-style format that is expanded with the remaining arguments into a
+   pool-allocated description of STRING. */
+static void PRINTF_FORMAT (3, 4)
+add_string (struct get_strings_aux *aux,
+            const char *string, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, string, false, formatted_title);
+}
+
+/* Records ID in AUX as an identifier (its ID flag is true).  TITLE is a
+   printf-style format that is expanded with the remaining arguments into a
+   pool-allocated description of ID. */
+static void PRINTF_FORMAT (3, 4)
+add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
+{
+  char *formatted_title;
+  va_list ap;
+
+  va_start (ap, title);
+  formatted_title = pool_vasprintf (aux->pool, title, ap);
+  va_end (ap);
+
+  add_string__ (aux, id, true, formatted_title);
+}
+
+/* Retrieves significant string data from R in its raw format, to allow the
+   caller to try to detect the encoding in use.
+
+   Returns the number of strings retrieved N.  Sets each of *TITLESP, *IDSP,
+   and *STRINGSP to an array of N elements allocated from POOL.  For each I in
+   0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
+   whatever encoding system file R uses.  *IDSP[I] is true if *STRINGSP[I] must
+   be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
+   text.
+
+   Strings are appended in this order: variable names, variable labels, value
+   labels, the four file header fields, the extra product info extension
+   record (if present), document lines, and finally each multiple response
+   set's name, label, and counted value. */
+size_t
+sfm_get_strings (const struct sfm_reader *r, struct pool *pool,
+                 char ***titlesp, bool **idsp, char ***stringsp)
+{
+  const struct sfm_mrset *mrset;
+  struct get_strings_aux aux;
+  size_t var_idx;
+  size_t i, j, k;
+
+  aux.pool = pool;
+  aux.titles = NULL;
+  aux.strings = NULL;
+  aux.ids = NULL;
+  aux.allocated = 0;
+  aux.n = 0;
+
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1) /* Records with width -1 are skipped and not numbered. */
+      add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
+
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1)
+      {
+        var_idx++;              /* Keeps label numbers in step with the names above. */
+        if (r->vars[i].label)
+          add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
+                      var_idx);
+      }
+
+  k = 0;
+  for (i = 0; i < r->n_labels; i++)
+    for (j = 0; j < r->labels[i].n_labels; j++)
+      add_string (&aux, r->labels[i].labels[j].label,
+                  _("Value Label %zu"), k++); /* Numbered from 0, unlike the other titles. */
+
+  add_string (&aux, r->header.creation_date, _("Creation Date"));
+  add_string (&aux, r->header.creation_time, _("Creation Time"));
+  add_string (&aux, r->header.eye_catcher, _("Product"));
+  add_string (&aux, r->header.file_label, _("File Label"));
+
+  if (r->extensions[EXT_PRODUCT_INFO])
+    add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
+                _("Extra Product Info"));
+
+  if (r->document)
+    {
+      size_t i;                 /* Shadows the outer I for this block only. */
+
+      for (i = 0; i < r->document->n_lines; i++)
+        {
+          char line[81];
+
+          memcpy (line, r->document->documents + i * 80, 80); /* Lines are 80 bytes apiece. */
+          line[80] = '\0';
+
+          add_string (&aux, line, _("Document Line %zu"), i + 1);
+        }
+    }
+
+  for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
+    {
+      size_t mrset_idx = mrset - r->mrsets + 1; /* 1-based position of MRSET. */
+
+      add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
+      if (mrset->label[0])
+        add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
+
+      /* Skip the variables because they ought to be duplicates. */
+
+      if (mrset->counted)
+        add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
+                    mrset_idx);
+    }
+
+  /* NOTE(review): the records listed below are not collected here -- presumably still to do. */
+  /* data file attributes */
+  /* variable attributes */
+  /* long var map */
+  /* long string value labels */
+  /* long string missing values */
+
+  *titlesp = aux.titles;
+  *idsp = aux.ids;
+  *stringsp = aux.strings;
+  return aux.n;
+}
+
  /* Decodes the dictionary read from R, saving it into into *DICT.  Character
     strings in R are decoded using ENCODING, or an encoding obtained from R if
     ENCODING is null, or the locale encoding if R specifies no encoding.
@@ -588,7 +738,16 @@ sfm_decode (struct sfm_reader *r, const char *encoding,
      {
        encoding = sfm_get_encoding (r);
        if (encoding == NULL)
-        encoding = locale_charset ();
+        {
+          sys_warn (r, -1, _("This system file does not indicate its own "
+                             "character encoding.  Using default encoding "
+                             "%s.  For best results, specify an encoding "
+                             "explicitly.  Use SYSFILE INFO with "
+                             "ENCODING=\"DETECT\" to analyze the possible "
+                             "encodings."),
+                    locale_charset ());
+          encoding = locale_charset ();
+        }
      }
  
    dict = dict_create (encoding);
@@ -906,7 +1065,7 @@ read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
        || !read_int (r, &record->missing_value_code)
        || !read_int (r, &record->print_format)
        || !read_int (r, &record->write_format)
-      || !read_bytes (r, record->name, sizeof record->name))
+      || !read_string (r, record->name, sizeof record->name))
      return false;
  
    if (has_variable_label == 1)
@@ -1242,7 +1401,7 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
        size_t i;
  
        name = recode_string_pool ("UTF-8", dict_encoding,
-                                 rec->name, 8, r->pool);
+                                 rec->name, -1, r->pool);
        name[strcspn (name, " ")] = '\0';
  
        if (!dict_id_is_valid (dict, name, false)
diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h

index 254e810a6c83d6f60de25e7f1203f720e9aa910d..849da670fce2f1f0ebe21b830dd41386eee536e0 100644 (file)
--- a/src/data/sys-file-reader.h
+++ b/src/data/sys-file-reader.h
@@ -50,6 +50,8 @@ bool sfm_close (struct sfm_reader *);
  
  /* Obtaining information about an sfm_reader before . */
  const char *sfm_get_encoding (const struct sfm_reader *);
+size_t sfm_get_strings (const struct sfm_reader *, struct pool *pool,
+                        char ***labels, bool **ids, char ***values);
  
  /* Decoding a system file's dictionary and obtaining a casereader. */
  struct casereader *sfm_decode (struct sfm_reader *, const char *encoding,
diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c

index 6b74f75a6622d0860b8679496063bbfd02df0806..05f9be91fbd4224b87f9789597d4a9d09d292f24 100644 (file)
--- a/src/language/dictionary/sys-file-info.c
+++ b/src/language/dictionary/sys-file-info.c
@@ -17,6 +17,7 @@
  #include <config.h>
  
  #include <ctype.h>
+#include <errno.h>
  #include <float.h>
  #include <stdlib.h>
  
@@ -36,11 +37,16 @@
  #include "language/lexer/lexer.h"
  #include "language/lexer/variable-parser.h"
  #include "libpspp/array.h"
+#include "libpspp/hash-functions.h"
+#include "libpspp/i18n.h"
  #include "libpspp/message.h"
  #include "libpspp/misc.h"
+#include "libpspp/pool.h"
  #include "libpspp/string-array.h"
  #include "output/tab.h"
+#include "output/text-item.h"
  
+#include "gl/localcharset.h"
  #include "gl/minmax.h"
  #include "gl/xalloc.h"
  
@@ -64,6 +70,9 @@ enum
  static int describe_variable (const struct variable *v, struct tab_table *t,
                                int r, int pc, int flags);
  
+static void report_encodings (const struct file_handle *,
+                              const struct sfm_reader *);
+
  /* SYSFILE INFO utility. */
  int
  cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
@@ -118,6 +127,13 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
    if (sfm_reader == NULL)
      goto error;
  
+  if (encoding && !strcasecmp (encoding, "detect"))
+    {
+      report_encodings (h, sfm_reader);
+      fh_unref (h);
+      return CMD_SUCCESS;
+    }
+
    reader = sfm_decode (sfm_reader, encoding, &d, &info);
    if (!reader)
      goto error;
@@ -191,7 +207,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
                     : info.compression == SFM_COMP_SIMPLE ? "SAV"
                     : "ZSAV");
  
-  tab_text (t, 0, r, TAB_LEFT, _("Charset:"));
+  tab_text (t, 0, r, TAB_LEFT, _("Encoding:"));
    tab_text (t, 1, r++, TAB_LEFT, dict_get_encoding (d));
  
    tab_submit (t);
@@ -745,3 +761,341 @@ display_vectors (const struct dictionary *dict, int sorted)
  
    free (vl);
  }
+\f
+/* Encoding analysis. */
+
+/* This list of encodings is taken from http://encoding.spec.whatwg.org/, as
+   retrieved February 2014.  Encodings not supported by glibc and encodings
+   relevant only to HTML have been removed.
+
+   The index of each name doubles as a bit position in the 64-bit `encodings'
+   mask of struct encoding, so this list must not grow beyond 64 entries. */
+static const char *encoding_names[] = {
+  "utf-8",
+  "windows-1252",
+  "iso-8859-2",
+  "iso-8859-3",
+  "iso-8859-4",
+  "iso-8859-5",
+  "iso-8859-6",
+  "iso-8859-7",
+  "iso-8859-8",
+  "iso-8859-10",
+  "iso-8859-13",
+  "iso-8859-14",
+  "iso-8859-16",
+  "macintosh",
+  "windows-874",
+  "windows-1250",
+  "windows-1251",
+  "windows-1253",
+  "windows-1254",
+  "windows-1255",
+  "windows-1256",
+  "windows-1257",
+  "windows-1258",
+  "koi8-r",
+  "koi8-u",
+  "ibm866",
+  "gb18030",
+  "big5",
+  "euc-jp",
+  "iso-2022-jp",
+  "shift_jis",
+  "euc-kr",
+};
+#define N_ENCODING_NAMES (sizeof encoding_names / sizeof *encoding_names)
+
+struct encoding                 /* A group of encodings that decode the file identically. */
+  {
+    uint64_t encodings;         /* Bit I is set if encoding_names[I] is in the group. */
+    char **utf8_strings;        /* The file's strings as recoded to UTF-8. */
+    unsigned int hash;          /* Hash over all of utf8_strings, for quick rejection. */
+  };
+
+/* Recodes each of the N STRINGS from ENCODING into UTF-8, trimming trailing
+   spaces from every result.  Returns an array of the N recoded strings,
+   allocated from POOL.
+
+   Returns NULL if recode_pedantically() rejects any string, or if any string
+   whose entry in IDS is true fails id_is_plausible() after recoding. */
+static char **
+recode_strings (struct pool *pool,
+                char **strings, bool *ids, size_t n,
+                const char *encoding)
+{
+  char **result = pool_alloc (pool, n * sizeof *result);
+  size_t idx;
+
+  for (idx = 0; idx < n; idx++)
+    {
+      struct substring converted;
+
+      if (recode_pedantically ("UTF-8", encoding, ss_cstr (strings[idx]),
+                               pool, &converted))
+        return NULL;
+
+      ss_rtrim (&converted, ss_cstr (" "));
+      converted.string[converted.length] = '\0';
+
+      if (ids[idx] && !id_is_plausible (converted.string, false))
+        return NULL;
+
+      result[idx] = converted.string;
+    }
+
+  return result;
+}
+
+/* Searches the N_ENCODINGS elements of ENCODINGS for one whose UTF-8 strings
+   are identical to the N_STRINGS strings in UTF8_STRINGS, whose combined hash
+   is HASH.  Returns the matching element, or NULL if there is none.
+
+   The hash is compared first so that most non-matching candidates are
+   rejected without any string comparison. */
+static struct encoding *
+find_duplicate_encoding (struct encoding *encodings, size_t n_encodings,
+                         char **utf8_strings, size_t n_strings,
+                         unsigned int hash)
+{
+  struct encoding *e;
+
+  for (e = encodings; e < &encodings[n_encodings]; e++)
+    {
+      size_t i;                 /* size_t, to match the type of N_STRINGS. */
+
+      if (e->hash != hash)
+        continue;
+
+      for (i = 0; i < n_strings; i++)
+        if (strcmp (utf8_strings[i], e->utf8_strings[i]))
+          break;
+
+      /* The loop ran to completion only if every string matched. */
+      if (i >= n_strings)
+        return e;
+    }
+
+  return NULL;
+}
+
+/* Returns true if string number STRING_IDX has the same UTF-8 text in all
+   N_ENCODINGS elements of ENCODINGS, false if any of them differs from the
+   first element's version. */
+static bool
+all_equal (const struct encoding *encodings, size_t n_encodings,
+           size_t string_idx)
+{
+  const char *reference = encodings[0].utf8_strings[string_idx];
+  size_t e;
+
+  for (e = 1; e < n_encodings; e++)
+    {
+      const char *candidate = encodings[e].utf8_strings[string_idx];
+
+      if (strcmp (reference, candidate) != 0)
+        return false;
+    }
+
+  return true;
+}
+
+/* Returns the length, in bytes, of the leading part of string number
+   STRING_IDX that is identical in all N_ENCODINGS elements of ENCODINGS.  The
+   length is then shortened until it is zero or ends just after a space in the
+   first element's version of the string. */
+static int
+equal_prefix (const struct encoding *encodings, size_t n_encodings,
+              size_t string_idx)
+{
+  const char *first = encodings[0].utf8_strings[string_idx];
+  size_t common = strlen (first);
+  size_t e;
+
+  /* Narrow COMMON against each other version; stop once nothing is shared. */
+  for (e = 1; e < n_encodings && common > 0; e++)
+    {
+      const char *other = encodings[e].utf8_strings[string_idx];
+      size_t pos = 0;
+
+      while (pos < common && first[pos] == other[pos])
+        pos++;
+      common = pos;
+    }
+
+  /* Back up to the character after the nearest preceding space. */
+  while (common > 0 && first[common - 1] != ' ')
+    common--;
+  return common;
+}
+
+/* Returns the length, in bytes, of the trailing part of string number
+   STRING_IDX that is identical in all N_ENCODINGS elements of ENCODINGS.  The
+   length is then shortened until it is zero or the suffix begins with a space
+   in the first element's version of the string. */
+static int
+equal_suffix (const struct encoding *encodings, size_t n_encodings,
+              size_t string_idx)
+{
+  const char *first = encodings[0].utf8_strings[string_idx];
+  size_t first_len = strlen (first);
+  size_t common = first_len;
+  size_t e;
+
+  /* Narrow COMMON against each other version; stop once nothing is shared. */
+  for (e = 1; e < n_encodings && common > 0; e++)
+    {
+      const char *other = encodings[e].utf8_strings[string_idx];
+      size_t other_len = strlen (other);
+      size_t limit = other_len < common ? other_len : common;
+      size_t matched = 0;
+
+      while (matched < limit
+             && first[first_len - matched - 1] == other[other_len - matched - 1])
+        matched++;
+      common = matched;
+    }
+
+  /* Move the start of the suffix forward to the nearest space. */
+  while (common > 0 && first[first_len - common] != ' ')
+    common--;
+  return common;
+}
+
+/* Reports on the encodings that can read the system file R, whose file handle
+   is H.
+
+   Each candidate in encoding_names[] is tried in turn with recode_strings().
+   Candidates that yield identical UTF-8 text are grouped into one numbered
+   row of the first output table.  If the groups disagree about any string, a
+   second table shows each group's interpretation of every such string, with
+   the prefix and suffix shared by all groups replaced by "...".
+
+   Emits a warning and no tables if no candidate encoding works. */
+static void
+report_encodings (const struct file_handle *h, const struct sfm_reader *r)
+{
+  char **titles;
+  char **strings;
+  bool *ids;
+  struct encoding encodings[N_ENCODING_NAMES];
+  size_t n_encodings, n_strings, n_unique_strings;
+  size_t i, j;
+  struct tab_table *t;
+  struct text_item *text;
+  struct pool *pool;
+  size_t row;
+
+  pool = pool_create ();
+  n_strings = sfm_get_strings (r, pool, &titles, &ids, &strings);
+
+  n_encodings = 0;
+  for (i = 0; i < N_ENCODING_NAMES; i++)
+    {
+      char **utf8_strings;
+      struct encoding *e;
+      unsigned int hash;
+
+      utf8_strings = recode_strings (pool, strings, ids, n_strings,
+                                     encoding_names[i]);
+      if (!utf8_strings)
+        continue;
+
+      /* Hash utf8_strings. */
+      hash = 0;
+      for (j = 0; j < n_strings; j++)
+        hash = hash_string (utf8_strings[j], hash);
+
+      /* If there's a duplicate encoding, just mark it. */
+      e = find_duplicate_encoding (encodings, n_encodings,
+                                   utf8_strings, n_strings, hash);
+      if (e)
+        {
+          e->encodings |= UINT64_C (1) << i;
+          continue;
+        }
+
+      e = &encodings[n_encodings++];
+      e->encodings = UINT64_C (1) << i;
+      e->utf8_strings = utf8_strings;
+      e->hash = hash;
+    }
+  if (!n_encodings)
+    {
+      msg (SW, _("No valid encodings found."));
+      pool_destroy (pool);
+      return;
+    }
+
+  text = text_item_create_format (
+    TEXT_ITEM_PARAGRAPH,
+    _("The following table lists the encodings that can successfully read %s, "
+      "by specifying the encoding name on the GET command's ENCODING "
+      "subcommand.  Encodings that yield identical text are listed "
+      "together."), fh_get_name (h));
+  text_item_submit (text);
+
+  /* First table: one numbered row per group of equivalent encodings. */
+  t = tab_create (2, n_encodings + 1);
+  tab_title (t, _("Usable encodings for %s."), fh_get_name (h));
+  tab_headers (t, 1, 0, 1, 0);
+  tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 1, n_encodings);
+  tab_hline (t, TAL_1, 0, 1, 1);
+  tab_text (t, 0, 0, TAB_RIGHT, "#");
+  tab_text (t, 1, 0, TAB_LEFT, _("Encodings"));
+  for (i = 0; i < n_encodings; i++)
+    {
+      struct string s;
+
+      ds_init_empty (&s);
+      for (j = 0; j < 64; j++)
+        if (encodings[i].encodings & (UINT64_C (1) << j))
+          ds_put_format (&s, "%s, ", encoding_names[j]);
+      ds_chomp (&s, ss_cstr (", "));
+
+      /* I is a size_t, so it must be formatted with %zu, not %d. */
+      tab_text_format (t, 0, i + 1, TAB_RIGHT, "%zu", i + 1);
+      tab_text (t, 1, i + 1, TAB_LEFT, ds_cstr (&s));
+      ds_destroy (&s);
+    }
+  tab_submit (t);
+
+  n_unique_strings = 0;
+  for (i = 0; i < n_strings; i++)
+    if (!all_equal (encodings, n_encodings, i))
+      n_unique_strings++;
+  if (!n_unique_strings)
+    {
+      pool_destroy (pool);
+      return;
+    }
+
+  text = text_item_create_format (
+    TEXT_ITEM_PARAGRAPH,
+    _("The following table lists text strings in the file dictionary that "
+      "the encodings above interpret differently, along with those "
+      "interpretations."));
+  text_item_submit (text);
+
+  /* Second table: N_ENCODINGS rows for each string that the groups disagree
+     about. */
+  t = tab_create (3, (n_encodings * n_unique_strings) + 1);
+  tab_title (t, _("%s encoded text strings."), fh_get_name (h));
+  tab_headers (t, 1, 0, 1, 0);
+  tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, 2, n_encodings * n_unique_strings);
+  tab_hline (t, TAL_1, 0, 2, 1);
+
+  tab_text (t, 0, 0, TAB_LEFT, _("Purpose"));
+  tab_text (t, 1, 0, TAB_RIGHT, "#");
+  tab_text (t, 2, 0, TAB_LEFT, _("Text"));
+
+  row = 1;
+  for (i = 0; i < n_strings; i++)
+    if (!all_equal (encodings, n_encodings, i))
+      {
+        size_t prefix = equal_prefix (encodings, n_encodings, i);
+        size_t suffix = equal_suffix (encodings, n_encodings, i);
+
+        /* The common prefix and suffix are computed independently, so they
+           can overlap within a short string (e.g. "ab ab" versus
+           "ab ab ab").  Eliding both would make the length computed below
+           wrap around, so keep only the prefix elision in that case. */
+        for (j = 0; j < n_encodings; j++)
+          if (strlen (encodings[j].utf8_strings[i]) < prefix + suffix)
+            {
+              suffix = 0;
+              break;
+            }
+
+        tab_joint_text (t, 0, row, 0, row + n_encodings - 1,
+                        TAB_LEFT, titles[i]);
+        tab_hline (t, TAL_1, 0, 2, row);
+        for (j = 0; j < n_encodings; j++)
+          {
+            const char *s = encodings[j].utf8_strings[i] + prefix;
+
+            /* J is a size_t, so it must be formatted with %zu, not %d. */
+            tab_text_format (t, 1, row, TAB_RIGHT, "%zu", j + 1);
+            if (prefix || suffix)
+              {
+                size_t len = strlen (s) - suffix;
+                struct string entry;
+
+                ds_init_empty (&entry);
+                if (prefix)
+                  ds_put_cstr (&entry, "...");
+                ds_put_substring (&entry, ss_buffer (s, len));
+                if (suffix)
+                  ds_put_cstr (&entry, "...");
+                tab_text (t, 2, row, TAB_LEFT, ds_cstr (&entry));
+                /* Release ENTRY, as the first table's loop does for its
+                   string after passing it to tab_text(). */
+                ds_destroy (&entry);
+              }
+            else
+              tab_text (t, 2, row, TAB_LEFT, s);
+            row++;
+          }
+      }
+  tab_submit (t);
+
+  pool_destroy (pool);
+}
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c

index 10b3927f9efd1040b6fbc010b33aa352c66b7f66..df7ae67da0d9695bfa511953a0a0e5e6f3c23e80 100644 (file)
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -148,12 +148,10 @@ recode_string_len (const char *to, const char *from,
     Returns the output length if successful, -1 if the output buffer is too
     small. */
  static ssize_t
-try_recode (iconv_t conv,
+try_recode (iconv_t conv, char fallbackchar,
              const char *in, size_t inbytes,
              char *out_, size_t outbytes)
  {
-  /* FIXME: Need to ensure that this char is valid in the target encoding */
-  const char fallbackchar = '?';
    char *out = out_;
    int i;
  
@@ -181,14 +179,18 @@ try_recode (iconv_t conv,
            {
            case EINVAL:
              if (outbytes < 2)
-              return -1;
+              return -E2BIG;
+            if (!fallbackchar)
+              return -EINVAL;
              *out++ = fallbackchar;
              *out = '\0';
              return out - out_;
  
            case EILSEQ:
              if (outbytes == 0)
-              return -1;
+              return -E2BIG;
+            if (!fallbackchar)
+              return -EILSEQ;
              *out++ = fallbackchar;
              outbytes--;
              if (inp)
@@ -199,7 +201,7 @@ try_recode (iconv_t conv,
              break;
  
            case E2BIG:
-            return -1;
+            return -E2BIG;
  
            default:
              /* should never happen */
@@ -211,7 +213,7 @@ try_recode (iconv_t conv,
      }
  
    if (outbytes == 0)
-    return -1;
+    return -E2BIG;
  
    *out = '\0';
    return out - out_;
@@ -518,6 +520,57 @@ filename_to_utf8 (const char *filename)
    return recode_string ("UTF-8", filename_encoding (), filename, -1);
  }
  
+/* Converts TEXT from encoding FROM to encoding TO, storing the result,
+   allocated from POOL, in *OUT.  A null TO or FROM means default_encoding.
+   Returns 0 on success, otherwise a positive errno value.
+
+   FALLBACKCHAR is passed along to try_recode().  If no converter can be
+   created, a nonzero FALLBACKCHAR makes this function store a verbatim copy
+   of TEXT and succeed, whereas a zero FALLBACKCHAR makes it fail with
+   EPROTO. */
+static int
+recode_substring_pool__ (const char *to, const char *from,
+                         struct substring text, char fallbackchar,
+                         struct pool *pool, struct substring *out)
+{
+  iconv_t conv;
+  size_t bufsize;
+
+  conv = create_iconv (to != NULL ? to : default_encoding,
+                       from != NULL ? from : default_encoding);
+  if (conv == (iconv_t) -1)
+    {
+      if (!fallbackchar)
+        return EPROTO;
+
+      out->string = pool_malloc (pool, text.length + 1);
+      out->length = text.length;
+      memcpy (out->string, text.string, text.length);
+      out->string[out->length] = '\0';
+      return 0;
+    }
+
+  /* Retry with a doubled buffer each time try_recode() runs out of room.
+     The loop condition fails only if BUFSIZE wraps around. */
+  bufsize = text.length + 1;
+  while (bufsize > text.length)
+    {
+      char *output = pool_malloc (pool, bufsize);
+      ssize_t retval = try_recode (conv, fallbackchar,
+                                   text.string, text.length,
+                                   output, bufsize);
+
+      if (retval >= 0)
+        {
+          *out = ss_buffer (output, retval);
+          return 0;
+        }
+      pool_free (pool, output);
+
+      if (retval != -E2BIG)
+        return -retval;
+
+      bufsize *= 2;
+    }
+
+  NOT_REACHED ();
+}
+
  /* Converts the string TEXT, which should be encoded in FROM-encoding, to a
     dynamically allocated string in TO-encoding.  Any characters which cannot be
     converted will be represented by '?'.
@@ -533,42 +586,32 @@ struct substring
  recode_substring_pool (const char *to, const char *from,
                         struct substring text, struct pool *pool)
  {
-  size_t outbufferlength;
-  iconv_t conv ;
-
-  if (to == NULL)
-    to = default_encoding;
-
-  if (from == NULL)
-    from = default_encoding;
-
-  conv = create_iconv (to, from);
+  struct substring out;
  
-  if ( (iconv_t) -1 == conv )
-    {
-      struct substring out;
+  recode_substring_pool__ (to, from, text, '?', pool, &out);
+  return out;
+}
  
-      out.string = pool_malloc (pool, text.length + 1);
-      out.length = text.length;
-      memcpy (out.string, text.string, text.length);
-      out.string[out.length] = '\0';
-      return out;
-    }
+/* Converts the string TEXT, which should be encoded in FROM-encoding, to a
+   dynamically allocated string in TO-encoding.  On success, returns 0, and the
+   converted null-terminated string, allocated from POOL with pool_malloc(), is
+   stored in *OUT.  On failure, returns a positive errno value.
  
-  for ( outbufferlength = 1 ; outbufferlength != 0; outbufferlength <<= 1 )
-    if ( outbufferlength > text.length)
-      {
-        char *output = pool_malloc (pool, outbufferlength);
-        ssize_t output_len = try_recode (conv, text.string, text.length,
-                                         output, outbufferlength);
-        if (output_len >= 0)
-          return ss_buffer (output, output_len);
-        pool_free (pool, output);
-      }
+   The function fails with an error if any part of the input string is not
+   valid in the declared input encoding. */
+int
+recode_pedantically (const char *to, const char *from,
+                     struct substring text, struct pool *pool,
+                     struct substring *out)
+{
+  int error;
  
-  NOT_REACHED ();
+  error = recode_substring_pool__ (to, from, text, 0, pool, out);
+  if (error)
+    *out = ss_empty ();
+  return error;
  }
-
+\f
  void
  i18n_init (void)
  {
diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h

index 6722b5cec9c3cec743facd4238d50e4f9140c78a..54717bcaa983d955df4e30e19ad8f594bad2749d 100644 (file)
--- a/src/libpspp/i18n.h
+++ b/src/libpspp/i18n.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -40,6 +40,9 @@ char *recode_string_pool (const char *to, const char *from,
                           const char *text, int length, struct pool *);
  struct substring recode_substring_pool (const char *to, const char *from,
                                          struct substring text, struct pool *);
+int recode_pedantically (const char *to, const char *from,
+                         struct substring text, struct pool *,
+                         struct substring *out);
  
  size_t recode_string_len (const char *to, const char *from,
                            const char *text, int len);
diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at

index c4d16b3294038f0ff8d83954be9770cb31ba9cca..c63e751f230d1f7b4c84001950e18dd9065891d7 100644 (file)
--- a/tests/data/sys-file-reader.at
+++ b/tests/data/sys-file-reader.at
@@ -1530,6 +1530,42 @@ AT_CLEANUP
  \f
  AT_BANNER([system file reader - negative])
  
+AT_SETUP([unspecified character encoding])
+AT_KEYWORDS([sack synthetic system file positive])
+AT_DATA([sys-file.sack], [dnl
+dnl File header.
+"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; dnl Layout code
+4; dnl Nominal case size
+0; dnl Not compressed
+0; dnl Not weighted
+0; dnl No cases.
+100.0; dnl Bias.
+"01 Jan 11"; "20:53:52";
+"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
+i8 0 *3;
+
+dnl Numeric variables.
+2; 0; 0; 0; 0x050800 *2; s8 "A";
+2; 0; 0; 0; 0x050800 *2; s8 "B";
+2; 0; 0; 0; 0x050800 *2; s8 "C";
+2; 0; 0; 0; 0x050800 *2; s8 "D";
+
+dnl Dictionary termination record.
+999; 0;
+])
+for variant in be le; do
+  AT_CHECK([sack --$variant sys-file.sack > sys-file.sav])
+  AT_DATA([sys-file.sps], [dnl
+GET 'sys-file.sav'.
+])
+  AT_CHECK([pspp -O format=csv sys-file.sps], [0], [stdout])
+  AT_CHECK([sed 's/default encoding.*For/default encoding.  For/' stdout], [0], [dnl
+"warning: `sys-file.sav': This system file does not indicate its own character encoding.  Using default encoding.  For best results, specify an encoding explicitly.  Use SYSFILE INFO with ENCODING=""DETECT"" to analyze the possible encodings."
+])
+done
+AT_CLEANUP
+
  AT_SETUP([misplaced type 4 record])
  AT_KEYWORDS([sack synthetic system file negative])
  AT_DATA([sys-file.sack], [dnl
diff --git a/tests/data/sys-file.at b/tests/data/sys-file.at

index 033a7da9af414433bffaeb970c8ce8263e7f3a59..bd5671cb01d6b717ac1d188e5dde112d781bb67c 100644 (file)
--- a/tests/data/sys-file.at
+++ b/tests/data/sys-file.at
@@ -321,7 +321,7 @@ AT_BANNER([system files -- very long strings])
  AT_SETUP([read very long strings written by SPSS 13])
  AT_CHECK([cp $top_srcdir/tests/data/v13.sav .])
  AT_DATA([sys-file.sps], [dnl
-GET FILE='v13.sav'.
+GET FILE='v13.sav' ENCODING='utf-8'.
  DISPLAY VARIABLES.
  LIST.
  ])
@@ -355,7 +355,7 @@ AT_CLEANUP
  AT_SETUP([read very long strings written by SPSS 14])
  AT_CHECK([cp $top_srcdir/tests/data/v14.sav .])
  AT_DATA([sys-file.sps], [dnl
-GET FILE='v14.sav'.
+GET FILE='v14.sav' ENCODING='utf-8'.
  DISPLAY VARIABLES.
  LIST.
  ])
diff --git a/tests/language/dictionary/sys-file-info.at b/tests/language/dictionary/sys-file-info.at

index 6a5e4adfef1413b53674fda4abc7a64b1ae0ce6c..45eb3afbdf1e90c4d9bcfc84099445ef131505ef 100644 (file)
--- a/tests/language/dictionary/sys-file-info.at
+++ b/tests/language/dictionary/sys-file-info.at
@@ -18,7 +18,7 @@ AT_CHECK(
         -e '/^Endian:,/d' \
         -e '/^Integer Format:,/d' \
         -e '/^Real Format:,/d' \
-       -e '/^Charset:,/d' pspp.csv],
+       -e '/^Encoding:,/d' pspp.csv],
    [0], [dnl
  Table: Reading free-form data from INLINE.
  Variable,Format
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 16 Feb 2014 22:59:54 +0000 (14:59 -0800)
NEWS		patch \| blob \| history
doc/files.texi		patch \| blob \| history
src/data/sys-file-reader.c		patch \| blob \| history
src/data/sys-file-reader.h		patch \| blob \| history
src/language/dictionary/sys-file-info.c		patch \| blob \| history
src/libpspp/i18n.c		patch \| blob \| history
src/libpspp/i18n.h		patch \| blob \| history
tests/data/sys-file-reader.at		patch \| blob \| history
tests/data/sys-file.at		patch \| blob \| history
tests/language/dictionary/sys-file-info.at		patch \| blob \| history