X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=f63a122fe83b96776b632cef57b51e49c83c4bd1;hb=3bbb4370239deb29ebbf813d258aef6249e2a431;hp=cbbbdd153095ec710b216848c99c0486a4db6f74;hpb=d5429be34170d121aeffdb7c56a0ad0ab09b1748;p=pspp-builds.git

diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
index cbbbdd15..f63a122f 100644
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -25,6 +25,7 @@
 #include <setjmp.h>
 #include <stdlib.h>
 
+#include <libpspp/i18n.h>
 #include <libpspp/assertion.h>
 #include <libpspp/message.h>
 #include <libpspp/compiler.h>
@@ -87,6 +88,7 @@ struct sfm_reader
     double bias;		/* Compression bias, usually 100.0. */
     uint8_t opcodes[8];         /* Current block of opcodes. */
     size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
+    bool corruption_warning;    /* Warned about possible corruption? */
   };
 
 static const struct casereader_class sys_file_casereader_class;
@@ -186,6 +188,62 @@ static void read_long_string_value_labels (struct sfm_reader *,
 					   size_t size, size_t count,
 					   struct dictionary *);
 
+/* Recodes DICT's variable names, variable labels, and value labels to UTF-8. */
+static void
+recode_strings (struct dictionary *dict)
+{
+  int i;
+
+  const char *enc = dict_get_encoding (dict);
+  /* Fall back to the default encoding if DICT does not report one. */
+  if ( NULL == enc)
+    enc = get_default_encoding ();
+
+  for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
+    {
+      /* Convert the long variable name, renaming VAR only if that changes it. */
+      struct variable *var = dict_get_var (dict, i);
+      const char *native_name = var_get_name (var);
+      char *utf8_name = recode_string (UTF8, enc, native_name, -1);
+      if ( 0 != strcmp (utf8_name, native_name))
+	{
+	  if ( NULL == dict_lookup_var (dict, utf8_name))
+	    dict_rename_var (dict, var, utf8_name);
+	  else  /* Recoded name is already in use: warn and do not rename. */
+	    msg (MW,
+	     _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name);
+    }
+
+      free (utf8_name);
+
+      /* Convert the variable label, if VAR has one. */
+      if (var_has_label (var))
+	{
+	  char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
+	  var_set_label (var, utf8_label);
+	  free (utf8_label);
+	}
+      /* Convert each of the variable's value labels, if it has any. */
+      if (var_has_value_labels (var))
+	{
+	  const struct val_lab *vl = NULL;
+	  const struct val_labs *vlabs = var_get_value_labels (var);
+
+	  for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl))
+	    {
+	      const union value *val = val_lab_get_value (vl);
+	      const char *label = val_lab_get_label (vl);
+	      char *new_label = NULL;
+
+	      new_label = recode_string (UTF8, enc, label, -1);
+	      /* Replace the label attached to VAL with its recoded text. */
+	      var_replace_value_label (var, val, new_label);
+	      free (new_label);
+	    }
+	}
+    }
+}
+
 /* Opens the system file designated by file handle FH for
    reading.  Reads the system file's dictionary into *DICT.
    If INFO is non-null, then it receives additional info about the
@@ -213,6 +271,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
   r->oct_cnt = 0;
   r->has_long_var_names = false;
   r->opcode_idx = sizeof r->opcodes;
+  r->corruption_warning = false;
 
   /* TRANSLATORS: this fragment will be interpolated into
      messages in fh_lock() that identify types of files. */
@@ -303,6 +362,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
       r->has_long_var_names = true;
     }
 
+  recode_strings (*dict);
+
   /* Read record 999 data, which is just filler. */
   read_int (r);
 
@@ -446,9 +507,21 @@ read_header (struct sfm_reader *r, struct dictionary *dict,
   read_bytes (r, raw_bias, sizeof raw_bias);
   if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
     {
-      sys_warn (r, _("Compression bias is not the usual "
-                     "value of 100, or system file uses unrecognized "
-                     "floating-point format."));
+      uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+      if (memcmp (raw_bias, zero_bias, 8))
+        sys_warn (r, _("Compression bias is not the usual "
+                       "value of 100, or system file uses unrecognized "
+                       "floating-point format."));
+      else
+        {
+          /* Some software is known to write all-zeros to this
+             field.  Such software also writes floating-point
+             numbers in the format that we expect by default
+             (it seems that all software most likely does, in
+             reality), so don't warn in this case. */
+        }
+
       if (r->integer_format == INTEGER_MSB_FIRST)
         r->float_format = FLOAT_IEEE_DOUBLE_BE;
       else
@@ -582,7 +655,7 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict,
           value_set_missing (&value, mv_width);
           for (i = 0; i < missing_value_code; i++)
             {
-              char *s = value_str_rw (&value, mv_width);
+              uint8_t *s = value_str_rw (&value, mv_width);
               read_bytes (r, s, 8);
               mv_add_str (&mv, s);
             }
@@ -860,7 +933,7 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
     NOT_REACHED ();
   if (integer_representation != expected_integer_format)
     {
-      static const char *const endian[] = {N_("little-endian"), N_("big-endian")};
+      static const char *const endian[] = {N_("Little Endian"), N_("Big Endian")};
       sys_warn (r, _("Integer format indicated by system file (%s) "
                      "differs from expected (%s)."),
                 gettext (endian[integer_representation == 1]),
@@ -1138,7 +1211,7 @@ read_value_labels (struct sfm_reader *r,
 
   struct label
     {
-      char raw_value[8];        /* Value as uninterpreted bytes. */
+      uint8_t raw_value[8];        /* Value as uninterpreted bytes. */
       union value value;        /* Value. */
       char *label;              /* Null-terminated label string. */
     };
@@ -1236,7 +1309,7 @@ read_value_labels (struct sfm_reader *r,
 
       value_init_pool (subpool, &label->value, max_width);
       if (var_is_alpha (var[0]))
-        buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
+        u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
                        label->raw_value, sizeof label->raw_value, ' ');
       else
         label->value.f = float_get_double (r->float_format, label->raw_value);
@@ -1416,7 +1489,7 @@ read_long_string_value_labels (struct sfm_reader *r,
           /* Read value. */
           value_length = read_int (r);
           if (value_length == width)
-            read_string (r, value_str_rw (&value, width), width + 1);
+            read_bytes (r, value_str_rw (&value, width), width);
           else
             {
               sys_warn (r, _("Ignoring long string value %zu for variable %s, "
@@ -1473,11 +1546,11 @@ static void partial_record (struct sfm_reader *r)
 static void read_error (struct casereader *, const struct sfm_reader *);
 
 static bool read_case_number (struct sfm_reader *, double *);
-static bool read_case_string (struct sfm_reader *, char *, size_t);
+static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
 static int read_opcode (struct sfm_reader *);
 static bool read_compressed_number (struct sfm_reader *, double *);
-static bool read_compressed_string (struct sfm_reader *, char *);
-static bool read_whole_strings (struct sfm_reader *, char *, size_t);
+static bool read_compressed_string (struct sfm_reader *, uint8_t *);
+static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
 static bool skip_whole_strings (struct sfm_reader *, size_t);
 
 /* Reads and returns one case from READER's file.  Returns a null
@@ -1512,7 +1585,7 @@ sys_file_casereader_read (struct casereader *reader, void *r_)
         }
       else
         {
-          char *s = value_str_rw (v, sv->var_width);
+          uint8_t *s = value_str_rw (v, sv->var_width);
           if (!read_case_string (r, s + sv->offset, sv->segment_width))
             goto eof;
           if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
@@ -1574,7 +1647,7 @@ read_case_number (struct sfm_reader *r, double *d)
    Returns true if successful, false if end of file is
    reached immediately. */
 static bool
-read_case_string (struct sfm_reader *r, char *s, size_t length)
+read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
 {
   size_t whole = ROUND_DOWN (length, 8);
   size_t partial = length % 8;
@@ -1587,7 +1660,7 @@ read_case_string (struct sfm_reader *r, char *s, size_t length)
 
   if (partial)
     {
-      char bounce[8];
+      uint8_t bounce[8];
       if (!read_whole_strings (r, bounce, sizeof bounce))
         {
           if (whole)
@@ -1639,7 +1712,14 @@ read_compressed_number (struct sfm_reader *r, double *d)
       break;
 
     case 254:
-      sys_error (r, _("Compressed data is corrupt."));
+      float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
+      if (!r->corruption_warning)
+        {
+          r->corruption_warning = true;
+          sys_warn (r, _("Possible compressed data corruption: "
+                         "compressed spaces appear in numeric field."));
+        }
+      break;
 
     case 255:
       *d = SYSMIS;
@@ -1658,9 +1738,10 @@ read_compressed_number (struct sfm_reader *r, double *d)
    Returns true if successful, false if end of file is
    reached immediately. */
 static bool
-read_compressed_string (struct sfm_reader *r, char *dst)
+read_compressed_string (struct sfm_reader *r, uint8_t *dst)
 {
-  switch (read_opcode (r))
+  int opcode = read_opcode (r);
+  switch (opcode)
     {
     case -1:
     case 252:
@@ -1675,7 +1756,25 @@ read_compressed_string (struct sfm_reader *r, char *dst)
       break;
 
     default:
-      sys_error (r, _("Compressed data is corrupt."));
+      {
+        double value = opcode - r->bias;
+        float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
+        if (value == 0.0)
+          {
+            /* This has actually been seen "in the wild".  The submitter of the
+               file showed that the contents decoded as spaces, but they were
+               at the end of the field, so it's possible that the null bytes
+               just acted as null terminators. */
+          }
+        else if (!r->corruption_warning)
+          {
+            r->corruption_warning = true;
+            sys_warn (r, _("Possible compressed data corruption: "
+                           "string contains compressed integer (opcode %d)"),
+                      opcode);
+          }
+      }
+      break;
     }
 
   return true;
@@ -1687,7 +1786,7 @@ read_compressed_string (struct sfm_reader *r, char *dst)
    Returns true if successful, false if end of file is
    reached immediately. */
 static bool
-read_whole_strings (struct sfm_reader *r, char *s, size_t length)
+read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
 {
   assert (length % 8 == 0);
   if (!r->compressed)
@@ -1715,7 +1814,7 @@ read_whole_strings (struct sfm_reader *r, char *s, size_t length)
 static bool
 skip_whole_strings (struct sfm_reader *r, size_t length)
 {
-  char buffer[1024];
+  uint8_t buffer[1024];
   assert (length < sizeof buffer);
   return read_whole_strings (r, buffer, length);
 }