From: Ben Pfaff <blp@cs.stanford.edu>
Date: Wed, 8 Feb 2012 06:58:09 +0000 (-0800)
Subject: GET: Add an ENCODING subcommand.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=21559edd9991628d96df331e5b391ca6bade3497

GET: Add an ENCODING subcommand.

For example, this allows a Swedish EBCDIC file that doesn't contain
any indication of its codepage to be read with "ENCODING='IBM278'".
---

diff --git a/NEWS b/NEWS
index 916c7d4848..aa8c0df021 100644
--- a/NEWS
+++ b/NEWS
@@ -40,7 +40,7 @@ Changes from 0.6.2 to 0.7.9:
 
    - HOST has been updated to use more modern syntax.
 
-   - INCLUDE and INSERT have a new ENCODING subcommand.
+   - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
 
    - MISSING VALUES can now assign missing values to long string
      variables.
diff --git a/doc/files.texi b/doc/files.texi
index cdce0a3c46..86aecab52c 100644
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -139,6 +139,7 @@ GET
         /DROP=var_list
         /KEEP=var_list
         /RENAME=(src_names=target_names)@dots{}
+        /ENCODING='encoding'
 @end display
 
 @cmd{GET} clears the current dictionary and active dataset and
@@ -171,6 +172,13 @@ Each may be present any number of times.  @cmd{GET} never modifies a
 file on disk.  Only the active dataset read from the file
 is affected by these subcommands.
 
+PSPP tries to automatically detect the encoding of string data in the
+file.  Sometimes, however, this does not work well,
+especially for files written by old versions of SPSS or PSPP.  Specify
+the ENCODING subcommand with an IANA character set name as its string
+argument to override the default.  The ENCODING subcommand is a PSPP
+extension.
+
 @cmd{GET} does not cause the data to be read, only the dictionary.  The data
 is read later, when a procedure is executed.
 
diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs
index f6afa29b10..834ec401f4 100644
--- a/perl-module/PSPP.xs
+++ b/perl-module/PSPP.xs
@@ -709,7 +709,7 @@ CODE:
  	 fh_create_file (NULL, name, fh_default_properties () );
 
  sri = xmalloc (sizeof (*sri));
- sri->reader = sfm_open_reader (fh, &sri->dict, &sri->opts);
+ sri->reader = sfm_open_reader (fh, NULL, &sri->dict, &sri->opts);
 
  if ( NULL == sri->reader)
  {
diff --git a/src/data/any-reader.c b/src/data/any-reader.c
index 50feb6892e..1b488f208a 100644
--- a/src/data/any-reader.c
+++ b/src/data/any-reader.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -80,9 +80,15 @@ any_reader_may_open (const char *file)
 
 /* Returns a casereader for HANDLE.  On success, returns the new
    casereader and stores the file's dictionary into *DICT.  On
-   failure, returns a null pointer. */
+   failure, returns a null pointer.
+
+   Ordinarily the reader attempts to automatically detect the character
+   encoding based on the file's contents.  This isn't always possible,
+   especially for files written by old versions of SPSS or PSPP, so specifying
+   a nonnull ENCODING overrides the choice of character encoding.  */
 struct casereader *
-any_reader_open (struct file_handle *handle, struct dictionary **dict)
+any_reader_open (struct file_handle *handle, const char *encoding,
+                 struct dictionary **dict)
 {
   switch (fh_get_referent (handle))
     {
@@ -94,7 +100,7 @@ any_reader_open (struct file_handle *handle, struct dictionary **dict)
         if (result == IO_ERROR)
           return NULL;
         else if (result == YES)
-          return sfm_open_reader (handle, dict, NULL);
+          return sfm_open_reader (handle, encoding, dict, NULL);
 
         result = try_detect (fh_get_file_name (handle), pfm_detect);
         if (result == IO_ERROR)
diff --git a/src/data/any-reader.h b/src/data/any-reader.h
index e999aa33f5..fb36e99cce 100644
--- a/src/data/any-reader.h
+++ b/src/data/any-reader.h
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
 struct file_handle;
 struct dictionary;
 bool any_reader_may_open (const char *file_name);
-struct casereader *any_reader_open (struct file_handle *,
+struct casereader *any_reader_open (struct file_handle *, const char *encoding,
                                     struct dictionary **);
 
 #endif /* any-reader.h */
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
index 024b4ae182..7e8bcf0de3 100644
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -312,12 +312,17 @@ sfm_read_info_destroy (struct sfm_read_info *info)
 /* Opens the system file designated by file handle FH for reading.  Reads the
    system file's dictionary into *DICT.
 
+   Ordinarily the reader attempts to automatically detect the character
+   encoding based on the file's contents.  This isn't always possible,
+   especially for files written by old versions of SPSS or PSPP, so specifying
+   a nonnull ENCODING overrides the choice of character encoding.
+
    If INFO is non-null, then it receives additional info about the system file,
    which the caller must eventually free with sfm_read_info_destroy() when it
    is no longer needed. */
 struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
-                 struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+                 struct dictionary **dictp, struct sfm_read_info *infop)
 {
   struct sfm_reader *volatile r = NULL;
   struct sfm_read_info info;
@@ -454,8 +459,10 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
 
      First, figure out the correct character encoding, because this determines
      how the rest of the header data is to be interpreted. */
-  dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER],
-                                       extensions[EXT_ENCODING]));
+  dict = dict_create (encoding
+                      ? encoding
+                      : choose_encoding (r, &header, extensions[EXT_INTEGER],
+                                         extensions[EXT_ENCODING]));
   r->encoding = dict_get_encoding (dict);
 
   /* These records don't use variables at all. */
diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h
index be01277235..a8f16e10db 100644
--- a/src/data/sys-file-reader.h
+++ b/src/data/sys-file-reader.h
@@ -52,7 +52,7 @@ void sfm_read_info_destroy (struct sfm_read_info *);
 
 struct dictionary;
 struct file_handle;
-struct casereader *sfm_open_reader (struct file_handle *,
+struct casereader *sfm_open_reader (struct file_handle *, const char *encoding,
                                     struct dictionary **,
                                     struct sfm_read_info *);
 bool sfm_detect (FILE *);
diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c
index cf7bc70b22..5003ca2b89 100644
--- a/src/data/sys-file-writer.c
+++ b/src/data/sys-file-writer.c
@@ -74,6 +74,7 @@ struct sfm_writer
 
     bool compress;		/* 1=compressed, 0=not compressed. */
     casenumber case_cnt;	/* Number of cases written so far. */
+    uint8_t space;              /* ' ' in the file's character encoding. */
 
     /* Compression buffering.
 
@@ -176,6 +177,7 @@ struct casewriter *
 sfm_open_writer (struct file_handle *fh, struct dictionary *d,
                  struct sfm_write_options opts)
 {
+  struct encoding_info encoding_info;
   struct sfm_writer *w;
   mode_t mode;
   int i;
@@ -227,6 +229,9 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d,
       goto error;
     }
 
+  get_encoding_info (&encoding_info, dict_get_encoding (d));
+  w->space = encoding_info.space[0];
+
   /* Write the file header. */
   write_header (w, d);
 
@@ -712,6 +717,12 @@ write_mrsets (struct sfm_writer *w, const struct dictionary *dict,
   size_t n_mrsets;
   size_t i;
 
+  if (is_encoding_ebcdic_compatible (encoding))
+    {
+      /* FIXME: MRSETS are not yet written for EBCDIC-compatible encodings. */
+      return;
+    }
+
   n_mrsets = dict_get_n_mrsets (dict);
   if (n_mrsets == 0)
     return;
@@ -1251,7 +1262,7 @@ put_cmp_string (struct sfm_writer *w, const void *data, size_t size)
   assert (w->data_cnt < 8);
   assert (size <= 8);
 
-  memset (w->data[w->data_cnt], ' ', 8);
+  memset (w->data[w->data_cnt], w->space, 8);
   memcpy (w->data[w->data_cnt], data, size);
   w->data_cnt++;
 }
@@ -1313,7 +1324,7 @@ write_string (struct sfm_writer *w, const char *string, size_t width)
   size_t pad_bytes = width - data_bytes;
   write_bytes (w, string, data_bytes);
   while (pad_bytes-- > 0)
-    putc (' ', w->file);
+    putc (w->space, w->file);
 }
 
 /* Recodes null-terminated UTF-8 encoded STRING into ENCODING, and writes the
@@ -1374,5 +1385,5 @@ static void
 write_spaces (struct sfm_writer *w, size_t n)
 {
   while (n-- > 0)
-    putc (' ', w->file);
+    putc (w->space, w->file);
 }
diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c
index f306cad6f6..21736da8c6 100644
--- a/src/language/data-io/combine-files.c
+++ b/src/language/data-io/combine-files.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -227,7 +227,7 @@ combine_files (enum comb_command_type command,
           if (file->handle == NULL)
             goto error;
 
-          file->reader = any_reader_open (file->handle, &file->dict);
+          file->reader = any_reader_open (file->handle, NULL, &file->dict);
           if (file->reader == NULL)
             goto error;
         }
diff --git a/src/language/data-io/get.c b/src/language/data-io/get.c
index d32f25567a..35b894a750 100644
--- a/src/language/data-io/get.c
+++ b/src/language/data-io/get.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -73,6 +73,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
   struct file_handle *fh = NULL;
   struct dictionary *dict = NULL;
   struct case_map *map = NULL;
+  char *encoding = NULL;
 
   for (;;)
     {
@@ -87,6 +88,18 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
 	  if (fh == NULL)
             goto error;
 	}
+      else if (command == GET_CMD && lex_match_id (lexer, "ENCODING"))
+        {
+	  lex_match (lexer, T_EQUALS);
+
+          if (!lex_force_string (lexer))
+            goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+          lex_get (lexer);
+        }
       else if (command == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
 	{
 	  lex_match (lexer, T_EQUALS);
@@ -108,7 +121,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
       goto error;
     }
 
-  reader = any_reader_open (fh, &dict);
+  reader = any_reader_open (fh, encoding, &dict);
   if (reader == NULL)
     goto error;
 
@@ -130,6 +143,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
   dataset_set_source (ds, reader);
 
   fh_unref (fh);
+  free (encoding);
   return CMD_SUCCESS;
 
  error:
@@ -137,5 +151,6 @@ parse_read_command (struct lexer *lexer, struct dataset *ds,
   casereader_destroy (reader);
   if (dict != NULL)
     dict_destroy (dict);
+  free (encoding);
   return CMD_CASCADING_FAILURE;
 }
diff --git a/src/language/dictionary/apply-dictionary.c b/src/language/dictionary/apply-dictionary.c
index c2de9318ae..8531ba56d2 100644
--- a/src/language/dictionary/apply-dictionary.c
+++ b/src/language/dictionary/apply-dictionary.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -53,7 +53,7 @@ cmd_apply_dictionary (struct lexer *lexer, struct dataset *ds)
   handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
   if (!handle)
     return CMD_FAILURE;
-  reader = any_reader_open (handle, &dict);
+  reader = any_reader_open (handle, NULL, &dict);
   fh_unref (handle);
   if (dict == NULL)
     return CMD_FAILURE;
diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c
index bb990c79d9..31a685aa82 100644
--- a/src/language/dictionary/sys-file-info.c
+++ b/src/language/dictionary/sys-file-info.c
@@ -81,7 +81,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
   if (!h)
     return CMD_FAILURE;
 
-  reader = sfm_open_reader (h, &d, &info);
+  reader = sfm_open_reader (h, NULL, &d, &info);
   if (!reader)
     {
       fh_unref (h);
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c
index 149ad6fb2f..9658866056 100644
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -694,29 +694,37 @@ get_encoding_info (struct encoding_info *e, const char *name)
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
     "abcdefghijklmnopqrstuvwxyz{|}~");
 
-  struct substring out, cr, lf;
+  struct substring out, cr, lf, space;
   bool ok;
 
   memset (e, 0, sizeof *e);
 
   cr = recode_substring_pool (name, "UTF-8", ss_cstr ("\r"), NULL);
   lf = recode_substring_pool (name, "UTF-8", ss_cstr ("\n"), NULL);
-  ok = cr.length >= 1 && cr.length <= MAX_UNIT && cr.length == lf.length;
+  space = recode_substring_pool (name, "UTF-8", ss_cstr (" "), NULL);
+  ok = (cr.length >= 1
+        && cr.length <= MAX_UNIT
+        && cr.length == lf.length
+        && cr.length == space.length);
   if (!ok)
     {
       fprintf (stderr, "warning: encoding `%s' is not supported.\n", name);
       ss_dealloc (&cr);
       ss_dealloc (&lf);
+      ss_dealloc (&space);
       ss_alloc_substring (&cr, ss_cstr ("\r"));
       ss_alloc_substring (&lf, ss_cstr ("\n"));
+      ss_alloc_substring (&space, ss_cstr (" "));
     }
 
   e->unit = cr.length;
   memcpy (e->cr, cr.string, e->unit);
   memcpy (e->lf, lf.string, e->unit);
+  memcpy (e->space, space.string, e->unit);
 
   ss_dealloc (&cr);
   ss_dealloc (&lf);
+  ss_dealloc (&space);
 
   out = recode_substring_pool ("UTF-8", name, in, NULL);
   e->is_ascii_compatible = ss_equals (in, out);
diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h
index 27ccce361e..383ff12da5 100644
--- a/src/libpspp/i18n.h
+++ b/src/libpspp/i18n.h
@@ -134,6 +134,7 @@ struct encoding_info
     int unit;                   /* Unit width, in bytes. */
     char cr[MAX_UNIT];          /* \r in encoding, 'unit' bytes long. */
     char lf[MAX_UNIT];          /* \n in encoding, 'unit' bytes long. */
+    char space[MAX_UNIT];       /* ' ' in encoding, 'unit' bytes long. */
   };
 
 bool get_encoding_info (struct encoding_info *, const char *name);