Use UTF-8 case-insensitive hashes and comparisons for language identifiers.

[pspp] / src / data / sys-file-reader.c
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c

index 336549b7614248bee9ebaf7077dbcd1b0f2add1d..defe460f5cb6cc8f5249bfba6c9254d41ee6b7a6 100644 (file)
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -50,6 +50,7 @@
  #include "libpspp/str.h"
  #include "libpspp/stringi-set.h"
  
+#include "gl/c-strtod.h"
  #include "gl/c-ctype.h"
  #include "gl/inttostr.h"
  #include "gl/localcharset.h"
@@ -88,6 +89,7 @@ enum
  /* Fields from the top-level header record. */
  struct sfm_header_record
    {
+    char magic[5];              /* First 4 bytes of file, then null. */
      int weight_idx;             /* 0 if unweighted, otherwise a var index. */
      int nominal_case_size;      /* Number of var positions. */
  
@@ -213,6 +215,7 @@ static void skip_extension_record (struct sfm_reader *, int subtype);
  
  static const char *choose_encoding (
    struct sfm_reader *,
+  const struct sfm_header_record *,
    const struct sfm_extension_record *ext_integer,
    const struct sfm_extension_record *ext_encoding);
  
@@ -310,15 +313,20 @@ sfm_read_info_destroy (struct sfm_read_info *info)
  /* Opens the system file designated by file handle FH for reading.  Reads the
     system file's dictionary into *DICT.
  
+   Ordinarily the reader attempts to automatically detect the character
+   encoding based on the file's contents.  This isn't always possible,
+   especially for files written by old versions of SPSS or PSPP, so specifying
+   a nonnull ENCODING overrides the choice of character encoding.
+
     If INFO is non-null, then it receives additional info about the system file,
     which the caller must eventually free with sfm_read_info_destroy() when it
     is no longer needed. */
  struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
-                 struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+                 struct dictionary **dictp, struct sfm_read_info *infop)
  {
    struct sfm_reader *volatile r = NULL;
-  struct sfm_read_info info;
+  struct sfm_read_info *volatile info;
  
    struct sfm_header_record header;
  
@@ -332,7 +340,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
  
    struct sfm_extension_record *extensions[32];
  
-  struct dictionary *dict = NULL;
+  struct dictionary *volatile dict = NULL;
    size_t i;
  
    /* Create and initialize reader. */
@@ -345,7 +353,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
    r->opcode_idx = sizeof r->opcodes;
    r->corruption_warning = false;
  
-  memset (&info, 0, sizeof info);
+  info = infop ? infop : xmalloc (sizeof *info);
+  memset (info, 0, sizeof *info);
  
    /* TRANSLATORS: this fragment will be interpolated into
       messages in fh_lock() that identify types of files. */
@@ -365,7 +374,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
      goto error;
  
    /* Read header. */
-  read_header (r, &info, &header);
+  read_header (r, info, &header);
  
    vars = NULL;
    n_vars = allocated_vars = 0;
@@ -452,8 +461,10 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
  
       First, figure out the correct character encoding, because this determines
       how the rest of the header data is to be interpreted. */
-  dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
-                                       extensions[EXT_ENCODING]));
+  dict = dict_create (encoding
+                      ? encoding
+                      : choose_encoding (r, &header, extensions[EXT_INTEGER],
+                                         extensions[EXT_ENCODING]));
    r->encoding = dict_get_encoding (dict);
  
    /* These records don't use variables at all. */
@@ -461,7 +472,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
      parse_document (dict, document);
  
    if (extensions[EXT_INTEGER] != NULL)
-    parse_machine_integer_info (r, extensions[EXT_INTEGER], &info);
+    parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
  
    if (extensions[EXT_FLOAT] != NULL)
      parse_machine_float_info (r, extensions[EXT_FLOAT]);
@@ -469,7 +480,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
    if (extensions[EXT_FILE_ATTRS] != NULL)
      parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
  
-  parse_header (r, &header, &info, dict);
+  parse_header (r, &header, info, dict);
  
    /* Parse the variable records, the basis of almost everything else. */
    parse_variable_records (r, dict, vars, n_vars);
@@ -522,7 +533,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
       wrong when very long strings are involved, so don't warn in
       that case. */
    if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
-      && info.version_major != 13)
+      && info->version_major != 13)
      sys_warn (r, -1, _("File header claims %d variable positions but "
                         "%zu were read from file."),
                header.nominal_case_size, n_vars);
@@ -536,10 +547,11 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
    r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
  
    *dictp = dict;
-  if (infop)
-    *infop = info;
-  else
-    sfm_read_info_destroy (&info);
+  if (infop != info)
+    {
+      sfm_read_info_destroy (info);
+      free (info);
+    }
  
    return casereader_create_sequential
      (NULL, r->proto,
@@ -547,7 +559,12 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
                                         &sys_file_casereader_class, r);
  
  error:
-  sfm_read_info_destroy (&info);
+  if (infop != info)
+    {
+      sfm_read_info_destroy (info);
+      free (info);
+    }
+
    close_reader (r);
    dict_destroy (dict);
    *dictp = NULL;
@@ -598,13 +615,13 @@ sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
  bool
  sfm_detect (FILE *file)
  {
-  char rec_type[5];
+  char magic[5];
  
-  if (fread (rec_type, 4, 1, file) != 1)
+  if (fread (magic, 4, 1, file) != 1)
      return false;
-  rec_type[4] = '\0';
+  magic[4] = '\0';
  
-  return !strcmp ("$FL2", rec_type);
+  return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
  }
  \f
  /* Reads the global header of the system file.  Initializes *HEADER and *INFO,
@@ -614,14 +631,14 @@ static void
  read_header (struct sfm_reader *r, struct sfm_read_info *info,
               struct sfm_header_record *header)
  {
-  char rec_type[5];
    uint8_t raw_layout_code[4];
    uint8_t raw_bias[8];
  
-  read_string (r, rec_type, sizeof rec_type);
+  read_string (r, header->magic, sizeof header->magic);
    read_string (r, header->eye_catcher, sizeof header->eye_catcher);
  
-  if (strcmp ("$FL2", rec_type) != 0)
+  if (strcmp (ASCII_MAGIC, header->magic)
+      && strcmp (EBCDIC_MAGIC, header->magic))
      sys_error (r, 0, _("This is not an SPSS system file."));
  
    /* Identify integer format. */
@@ -1185,6 +1202,7 @@ parse_machine_integer_info (struct sfm_reader *r,
  
  static const char *
  choose_encoding (struct sfm_reader *r,
+                 const struct sfm_header_record *header,
                   const struct sfm_extension_record *ext_integer,
                   const struct sfm_extension_record *ext_encoding)
  {
@@ -1223,6 +1241,10 @@ choose_encoding (struct sfm_reader *r,
          }
      }
  
+  /* If the file magic number is EBCDIC then its character data is too. */
+  if (!strcmp (header->magic, EBCDIC_MAGIC))
+    return "EBCDIC-US";
+
    return locale_charset ();
  }
  
@@ -1407,6 +1429,7 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
                      _("MRSET %s has only %zu variables."), mrset->name,
                      mrset->n_vars);
            mrset_destroy (mrset);
+         stringi_set_destroy (&var_names);
            continue;
          }
  
@@ -1415,7 +1438,7 @@ parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
            mrset->width = width;
            value_init (&mrset->counted, width);
            if (width == 0)
-            mrset->counted.f = strtod (counted, NULL);
+            mrset->counted.f = c_strtod (counted, NULL);
            else
              value_copy_str_rpad (&mrset->counted, width,
                                   (const uint8_t *) counted, ' ');
@@ -1577,7 +1600,6 @@ parse_long_var_name_map (struct sfm_reader *r,
    while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
      {
        /* Validate long name. */
-      /* XXX need to reencode name to UTF-8 */
        if (!dict_id_is_valid (dict, long_name, false))
          {
            sys_warn (r, record->pos,
@@ -1588,7 +1610,7 @@ parse_long_var_name_map (struct sfm_reader *r,
          }
  
        /* Identify any duplicates. */
-      if (strcasecmp (var_get_short_name (var, 0), long_name)
+      if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
            && dict_lookup_var (dict, long_name) != NULL)
          {
            sys_warn (r, record->pos,
@@ -2425,7 +2447,7 @@ text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
  
    start = text->pos;
    n = 0;
-  for (;;)
+  while (text->pos < text->buffer.length)
      {
        int c = text->buffer.string[text->pos];
        if (c < '0' || c > '9')
@@ -2433,7 +2455,7 @@ text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
        n = (n * 10) + (c - '0');
        text->pos++;
      }
-  if (start == text->pos)
+  if (text->pos >= text->buffer.length || start == text->pos)
      {
        sys_warn (r, text->start,
                  _("Expecting digit at offset %zu in MRSETS record."),