GET: Add an ENCODING subcommand.

author Ben Pfaff <blp@cs.stanford.edu>

Wed, 8 Feb 2012 06:58:09 +0000 (22:58 -0800)

committer Ben Pfaff <blp@cs.stanford.edu>

Wed, 8 Feb 2012 06:59:16 +0000 (22:59 -0800)
author Ben Pfaff <blp@cs.stanford.edu>
Wed, 8 Feb 2012 06:58:09 +0000 (22:58 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Wed, 8 Feb 2012 06:59:16 +0000 (22:59 -0800)
diff --git a/NEWS b/NEWS

index 916c7d484890a3e10f009b0ce790bebab5d51f77..aa8c0df021fba1c811e5a9ba7bd84d2ea4fe7670 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -40,7 +40,7 @@ Changes from 0.6.2 to 0.7.9:
  
     - HOST has been updated to use more modern syntax.
  
-   - INCLUDE and INSERT have a new ENCODING subcommand.
+   - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
  
     - MISSING VALUES can now assign missing values to long string
       variables.
diff --git a/doc/files.texi b/doc/files.texi

index cdce0a3c4689875560ef57dad998253c5f51b5e2..86aecab52c97b31025458e7af0a4d47e20ec9c07 100644 (file)
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -139,6 +139,7 @@ GET
          /DROP=var_list
          /KEEP=var_list
          /RENAME=(src_names=target_names)@dots{}
+        /ENCODING='encoding'
  @end display
  
  @cmd{GET} clears the current dictionary and active dataset and
@@ -171,6 +172,13 @@ Each may be present any number of times.  @cmd{GET} never modifies a
  file on disk.  Only the active dataset read from the file
  is affected by these subcommands.
  
+PSPP tries to automatically detect the encoding of string data in the
+file.  Sometimes, however, this does not work well,
+especially for files written by old versions of SPSS or PSPP.  Specify
+the ENCODING subcommand with an IANA character set name as its string
+argument to override the default.  The ENCODING subcommand is a PSPP
+extension.
+
  @cmd{GET} does not cause the data to be read, only the dictionary.  The data
  is read later, when a procedure is executed.
  
diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs

index f6afa29b10098ee7614e8fa880cccaf1269b5097..834ec401f483900d5cd1cb5bfb5d3e08045e7e4a 100644 (file)
--- a/perl-module/PSPP.xs
+++ b/perl-module/PSPP.xs
@@ -709,7 +709,7 @@ CODE:
          fh_create_file (NULL, name, fh_default_properties () );
  
   sri = xmalloc (sizeof (*sri));
- sri->reader = sfm_open_reader (fh, &sri->dict, &sri->opts);
+ sri->reader = sfm_open_reader (fh, NULL, &sri->dict, &sri->opts);
  
   if ( NULL == sri->reader)
   {
diff --git a/src/data/any-reader.c b/src/data/any-reader.c

index 50feb6892e936f4fbf9ea54d14f90707c11acf67..1b488f208a84c78f27936f4c1b513d46864c4c0d 100644 (file)
--- a/src/data/any-reader.c
+++ b/src/data/any-reader.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -80,9 +80,15 @@ any_reader_may_open (const char *file)
  
  /* Returns a casereader for HANDLE.  On success, returns the new
     casereader and stores the file's dictionary into *DICT.  On
-   failure, returns a null pointer. */
+   failure, returns a null pointer.
+
+   Ordinarily the reader attempts to automatically detect the character
+   encoding based on the file's contents.  This isn't always possible,
+   especially for files written by old versions of SPSS or PSPP, so specifying
+   a nonnull ENCODING overrides the choice of character encoding.  */
  struct casereader *
-any_reader_open (struct file_handle *handle, struct dictionary **dict)
+any_reader_open (struct file_handle *handle, const char *encoding,
+                 struct dictionary **dict)
  {
    switch (fh_get_referent (handle))
      {
@@ -94,7 +100,7 @@ any_reader_open (struct file_handle *handle, struct dictionary **dict)
          if (result == IO_ERROR)
            return NULL;
          else if (result == YES)
-          return sfm_open_reader (handle, dict, NULL);
+          return sfm_open_reader (handle, encoding, dict, NULL);
  
          result = try_detect (fh_get_file_name (handle), pfm_detect);
          if (result == IO_ERROR)
diff --git a/src/data/any-reader.h b/src/data/any-reader.h

index e999aa33f5be28543d18c1f0b120d47ded4264d0..fb36e99ccec87af844dd94249296901d69b0d130 100644 (file)
--- a/src/data/any-reader.h
+++ b/src/data/any-reader.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
  struct file_handle;
  struct dictionary;
  bool any_reader_may_open (const char *file_name);
-struct casereader *any_reader_open (struct file_handle *,
+struct casereader *any_reader_open (struct file_handle *, const char *encoding,
                                      struct dictionary **);
  
  #endif /* any-reader.h */
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c

index 024b4ae1827994d1d6d60a7820e4535295190f5d..7e8bcf0de38d316ea6d770976caff1f86ab76018 100644 (file)
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -312,12 +312,17 @@ sfm_read_info_destroy (struct sfm_read_info *info)
  /* Opens the system file designated by file handle FH for reading.  Reads the
     system file's dictionary into *DICT.
  
+   Ordinarily the reader attempts to automatically detect the character
+   encoding based on the file's contents.  This isn't always possible,
+   especially for files written by old versions of SPSS or PSPP, so specifying
+   a nonnull ENCODING overrides the choice of character encoding.
+
     If INFO is non-null, then it receives additional info about the system file,
     which the caller must eventually free with sfm_read_info_destroy() when it
     is no longer needed. */
  struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
-                 struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+                 struct dictionary **dictp, struct sfm_read_info *infop)
  {
    struct sfm_reader *volatile r = NULL;
    struct sfm_read_info info;
@@ -454,8 +459,10 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
  
       First, figure out the correct character encoding, because this determines
       how the rest of the header data is to be interpreted. */
-  dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER],
-                                       extensions[EXT_ENCODING]));
+  dict = dict_create (encoding
+                      ? encoding
+                      : choose_encoding (r, &header, extensions[EXT_INTEGER],
+                                         extensions[EXT_ENCODING]));
    r->encoding = dict_get_encoding (dict);
  
    /* These records don't use variables at all. */
diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h

index be01277235717ef14456a7333d69ef4aab29ce95..a8f16e10db5e452299bb4ffbdb7b24f8e91dd98f 100644 (file)
--- a/src/data/sys-file-reader.h
+++ b/src/data/sys-file-reader.h
@@ -52,7 +52,7 @@ void sfm_read_info_destroy (struct sfm_read_info *);
  
  struct dictionary;
  struct file_handle;
-struct casereader *sfm_open_reader (struct file_handle *,
+struct casereader *sfm_open_reader (struct file_handle *, const char *encoding,
                                      struct dictionary **,
                                      struct sfm_read_info *);
  bool sfm_detect (FILE *);
diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c

index cf7bc70b2229127193d77028ad9e8d0341a9a39c..5003ca2b89bf1d1b4a4fbf3b98efdec02040f8b1 100644 (file)
--- a/src/data/sys-file-writer.c
+++ b/src/data/sys-file-writer.c
@@ -74,6 +74,7 @@ struct sfm_writer
  
      bool compress;             /* 1=compressed, 0=not compressed. */
      casenumber case_cnt;       /* Number of cases written so far. */
+    uint8_t space;              /* ' ' in the file's character encoding. */
  
      /* Compression buffering.
  
@@ -176,6 +177,7 @@ struct casewriter *
  sfm_open_writer (struct file_handle *fh, struct dictionary *d,
                   struct sfm_write_options opts)
  {
+  struct encoding_info encoding_info;
    struct sfm_writer *w;
    mode_t mode;
    int i;
@@ -227,6 +229,9 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d,
        goto error;
      }
  
+  get_encoding_info (&encoding_info, dict_get_encoding (d));
+  w->space = encoding_info.space[0];
+
    /* Write the file header. */
    write_header (w, d);
  
@@ -712,6 +717,12 @@ write_mrsets (struct sfm_writer *w, const struct dictionary *dict,
    size_t n_mrsets;
    size_t i;
  
+  if (is_encoding_ebcdic_compatible (encoding))
+    {
+      /* FIXME. */
+      return;
+    }
+
    n_mrsets = dict_get_n_mrsets (dict);
    if (n_mrsets == 0)
      return;
@@ -1251,7 +1262,7 @@ put_cmp_string (struct sfm_writer *w, const void *data, size_t size)
    assert (w->data_cnt < 8);
    assert (size <= 8);
  
-  memset (w->data[w->data_cnt], ' ', 8);
+  memset (w->data[w->data_cnt], w->space, 8);
    memcpy (w->data[w->data_cnt], data, size);
    w->data_cnt++;
  }
@@ -1313,7 +1324,7 @@ write_string (struct sfm_writer *w, const char *string, size_t width)
    size_t pad_bytes = width - data_bytes;
    write_bytes (w, string, data_bytes);
    while (pad_bytes-- > 0)
-    putc (' ', w->file);
+    putc (w->space, w->file);
  }
  
  /* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the
@@ -1374,5 +1385,5 @@ static void
  write_spaces (struct sfm_writer *w, size_t n)
  {
    while (n-- > 0)
-    putc (' ', w->file);
+    putc (w->space, w->file);
  }
diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c

index f306cad6f645c1b10ce408233e78da63fe2d4fe6..21736da8c645239dec3dadcde4cae59765d0f712 100644 (file)
--- a/src/language/data-io/combine-files.c
+++ b/src/language/data-io/combine-files.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -227,7 +227,7 @@ combine_files (enum comb_command_type command,
            if (file->handle == NULL)
              goto error;
  
-          file->reader = any_reader_open (file->handle, &file->dict);
+          file->reader = any_reader_open (file->handle, NULL, &file->dict);
            if (file->reader == NULL)
              goto error;
          }
diff --git a/src/language/data-io/get.c b/src/language/data-io/get.c

index d32f25567a25dcfd28cf7b3dad7189cc2bee29f6..35b894a75037868ba3119b437e7a01e0bfd7b4c5 100644 (file)
--- a/src/language/data-io/get.c
+++ b/src/language/data-io/get.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -73,6 +73,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
    struct file_handle *fh = NULL;
    struct dictionary *dict = NULL;
    struct case_map *map = NULL;
+  char *encoding = NULL;
  
    for (;;)
      {
@@ -87,6 +88,18 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
           if (fh == NULL)
              goto error;
         }
+      else if (command == GET_CMD && lex_match_id (lexer, "ENCODING"))
+        {
+         lex_match (lexer, T_EQUALS);
+
+          if (!lex_force_string (lexer))
+            goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+          lex_get (lexer);
+        }
        else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
         {
           lex_match (lexer, T_EQUALS);
@@ -108,7 +121,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
        goto error;
      }
  
-  reader = any_reader_open (fh, &dict);
+  reader = any_reader_open (fh, encoding, &dict);
    if (reader == NULL)
      goto error;
  
@@ -130,6 +143,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
    dataset_set_source (ds, reader);
  
    fh_unref (fh);
+  free (encoding);
    return CMD_SUCCESS;
  
   error:
@@ -137,5 +151,6 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
    casereader_destroy (reader);
    if (dict != NULL)
      dict_destroy (dict);
+  free (encoding);
    return CMD_CASCADING_FAILURE;
  }
diff --git a/src/language/dictionary/apply-dictionary.c b/src/language/dictionary/apply-dictionary.c

index c2de9318ae81b13462c940aae839a580f886c2d4..8531ba56d2d257d30e3cc085ab42fafa91bca19c 100644 (file)
--- a/src/language/dictionary/apply-dictionary.c
+++ b/src/language/dictionary/apply-dictionary.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -53,7 +53,7 @@ cmd_apply_dictionary (struct lexer *lexer, struct dataset *ds)
    handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
    if (!handle)
      return CMD_FAILURE;
-  reader = any_reader_open (handle, &dict);
+  reader = any_reader_open (handle, NULL, &dict);
    fh_unref (handle);
    if (dict == NULL)
      return CMD_FAILURE;
diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c

index bb990c79d9e093aa28d292e2368faa34eaccdc13..31a685aa824276afd1d38326140bf9cac530504d 100644 (file)
--- a/src/language/dictionary/sys-file-info.c
+++ b/src/language/dictionary/sys-file-info.c
@@ -81,7 +81,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
    if (!h)
      return CMD_FAILURE;
  
-  reader = sfm_open_reader (h, &d, &info);
+  reader = sfm_open_reader (h, NULL, &d, &info);
    if (!reader)
      {
        fh_unref (h);
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c

index 149ad6fb2fa4a396a60fcb4d1d11963123c180ef..9658866056f01f7911c905e1d2d4896fb0288cbe 100644 (file)
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -694,29 +694,37 @@ get_encoding_info (struct encoding_info *e, const char *name)
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
      "abcdefghijklmnopqrstuvwxyz{|}~");
  
-  struct substring out, cr, lf;
+  struct substring out, cr, lf, space;
    bool ok;
  
    memset (e, 0, sizeof *e);
  
    cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
    lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
-  ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
+  space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
+  ok = (cr.length >= 1
+        && cr.length <= MAX_UNIT
+        && cr.length == lf.length
+        && cr.length == space.length);
    if (!ok)
      {
        fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
        ss_dealloc (&cr);
        ss_dealloc (&lf);
+      ss_dealloc (&space);
        ss_alloc_substring (&cr, ss_cstr ("\r"));
        ss_alloc_substring (&lf, ss_cstr ("\n"));
+      ss_alloc_substring (&space, ss_cstr (" "));
      }
  
    e->unit = cr.length;
    memcpy (e->cr, cr.string, e->unit);
    memcpy (e->lf, lf.string, e->unit);
+  memcpy (e->space, space.string, e->unit);
  
    ss_dealloc (&cr);
    ss_dealloc (&lf);
+  ss_dealloc (&space);
  
    out = recode_substring_pool ("UTF-8", name, in, NULL);
    e->is_ascii_compatible = ss_equals (in, out);
diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h

index 27ccce361e5ccfa8a6bf03c85967e96eaf4174ae..383ff12da53ab6437ede88f815292f4ad2aa86ab 100644 (file)
--- a/src/libpspp/i18n.h
+++ b/src/libpspp/i18n.h
@@ -134,6 +134,7 @@ struct encoding_info
      int unit;                   /* Unit width, in bytes. */
      char cr[MAX_UNIT];          /* \r in encoding, 'unit' bytes long. */
      char lf[MAX_UNIT];          /* \n in encoding, 'unit' bytes long. */
+    char space[MAX_UNIT];       /* ' ' in encoding, 'unit' bytes long. */
    };
  
  bool get_encoding_info (struct encoding_info *, const char *name);
author	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 8 Feb 2012 06:58:09 +0000 (22:58 -0800)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 8 Feb 2012 06:59:16 +0000 (22:59 -0800)
NEWS		patch \| blob \| history
doc/files.texi		patch \| blob \| history
perl-module/PSPP.xs		patch \| blob \| history
src/data/any-reader.c		patch \| blob \| history
src/data/any-reader.h		patch \| blob \| history
src/data/sys-file-reader.c		patch \| blob \| history
src/data/sys-file-reader.h		patch \| blob \| history
src/data/sys-file-writer.c		patch \| blob \| history
src/language/data-io/combine-files.c		patch \| blob \| history
src/language/data-io/get.c		patch \| blob \| history
src/language/dictionary/apply-dictionary.c		patch \| blob \| history
src/language/dictionary/sys-file-info.c		patch \| blob \| history
src/libpspp/i18n.c		patch \| blob \| history
src/libpspp/i18n.h		patch \| blob \| history