Add ENCODING subcommand to several commands.

author Ben Pfaff <blp@cs.stanford.edu>

Wed, 20 Jun 2012 05:43:24 +0000 (22:43 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Wed, 20 Jun 2012 06:09:38 +0000 (23:09 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Wed, 20 Jun 2012 05:43:24 +0000 (22:43 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Wed, 20 Jun 2012 06:09:38 +0000 (23:09 -0700)
diff --git a/NEWS b/NEWS

index 3974f542b3440606e8e504f573255d2a82e6966b..71ba057c00f2d3f2ab704ddc080b13d0fe01ac45 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -42,7 +42,8 @@ Changes from 0.6.2 to 0.7.9:
    
     - HOST has been updated to use more modern syntax.
  
    
     - HOST has been updated to use more modern syntax.
  
-   - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
+   - Most commands that work with data files now support a new
+     ENCODING subcommand.
  
     - MISSING VALUES can now assign missing values to long string
       variables.
  
     - MISSING VALUES can now assign missing values to long string
       variables.
diff --git a/doc/data-io.texi b/doc/data-io.texi

index 4862ccc5964e4d2e9783e5900b1f28901b542caf..da1712e3d46a7b9c322cf772aeb337f895096617 100644 (file)
--- a/doc/data-io.texi
+++ b/doc/data-io.texi
@@ -277,8 +277,9 @@ external file.  It may be used to specify a file name as a string or a
  file handle (@pxref{File Handles}).  If the @subcmd{FILE} subcommand is not used,
  then input is assumed to be specified within the command file using
  @cmd{BEGIN DATA}@dots{}@cmd{END DATA} (@pxref{BEGIN DATA}).
  file handle (@pxref{File Handles}).  If the @subcmd{FILE} subcommand is not used,
  then input is assumed to be specified within the command file using
  @cmd{BEGIN DATA}@dots{}@cmd{END DATA} (@pxref{BEGIN DATA}).
-The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE} subcommand is also used.
-It specifies the character encoding of the file.
+The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE}
+subcommand is also used.  It specifies the character encoding of the
+file.  @xref{INSERT}, for information on supported encodings.
  
  The optional @subcmd{RECORDS} subcommand, which takes a single integer as an
  argument, is used to specify the number of lines per record.
  
  The optional @subcmd{RECORDS} subcommand, which takes a single integer as an
  argument, is used to specify the number of lines per record.
@@ -503,7 +504,8 @@ of quoting is allowed.
  The @subcmd{NOTABLE} and @subcmd{TABLE} subcommands are as in @cmd{DATA LIST FIXED} above.
  @subcmd{NOTABLE} is the default.
  
  The @subcmd{NOTABLE} and @subcmd{TABLE} subcommands are as in @cmd{DATA LIST FIXED} above.
  @subcmd{NOTABLE} is the default.
  
-The @subcmd{FILE} and @subcmd{SKIP} subcommands are as in @cmd{DATA LIST FIXED} above.
+The @subcmd{FILE}, @subcmd{SKIP}, and @subcmd{ENCODING} subcommands
+are as in @cmd{DATA LIST FIXED} above.
  
  The variables to be parsed are given as a single list of variable names.
  This list must be introduced by a single slash (@samp{/}).  The set of
  
  The variables to be parsed are given as a single list of variable names.
  This list must be introduced by a single slash (@samp{/}).  The set of
@@ -525,7 +527,7 @@ on field width apply, but they are honored on output.
  DATA LIST LIST
          [(@{TAB,'@var{c}'@}, @dots{})]
          [@{NOTABLE,TABLE@}]
  DATA LIST LIST
          [(@{TAB,'@var{c}'@}, @dots{})]
          [@{NOTABLE,TABLE@}]
-        [FILE='@var{file_name'} [ENCODING='@var{encoding}']]
+        [FILE='@var{file_name}' [ENCODING='@var{encoding}']]
          [SKIP=@var{record_count}]
          /@var{var_spec}@dots{}
  
          [SKIP=@var{record_count}]
          /@var{var_spec}@dots{}
  
@@ -572,18 +574,21 @@ For text files:
                  /NAME='@var{file_name}
                  [/MODE=CHARACTER]
                  /TABWIDTH=@var{tab_width}
                  /NAME='@var{file_name}
                  [/MODE=CHARACTER]
                  /TABWIDTH=@var{tab_width}
+                [ENCODING='@var{encoding}']
  
  For binary files in native encoding with fixed-length records:
          FILE HANDLE @var{handle_name}
                  /NAME='@var{file_name}'
                  /MODE=IMAGE
                  [/LRECL=@var{rec_len}]
  
  For binary files in native encoding with fixed-length records:
          FILE HANDLE @var{handle_name}
                  /NAME='@var{file_name}'
                  /MODE=IMAGE
                  [/LRECL=@var{rec_len}]
+                [ENCODING='@var{encoding}']
  
  For binary files in native encoding with variable-length records:
          FILE HANDLE @var{handle_name}
                  /NAME='@var{file_name}'
                  /MODE=BINARY
                  [/LRECL=@var{rec_len}]
  
  For binary files in native encoding with variable-length records:
          FILE HANDLE @var{handle_name}
                  /NAME='@var{file_name}'
                  /MODE=BINARY
                  [/LRECL=@var{rec_len}]
+                [ENCODING='@var{encoding}']
  
  For binary files encoded in EBCDIC:
          FILE HANDLE @var{handle_name}
  
  For binary files encoded in EBCDIC:
          FILE HANDLE @var{handle_name}
@@ -591,6 +596,7 @@ For binary files encoded in EBCDIC:
                  /MODE=360
                  /RECFORM=@{FIXED,VARIABLE,SPANNED@}
                  [/LRECL=@var{rec_len}]
                  /MODE=360
                  /RECFORM=@{FIXED,VARIABLE,SPANNED@}
                  [/LRECL=@var{rec_len}]
+                [ENCODING='@var{encoding}']
  @end display
  
  Use @cmd{FILE HANDLE} to associate a file handle name with a file and
  @end display
  
  Use @cmd{FILE HANDLE} to associate a file handle name with a file and
@@ -726,6 +732,14 @@ The @subcmd{NAME} subcommand specifies the name of the file associated with the
  handle.  It is required in all modes but SCRATCH mode, in which its
  use is forbidden.
  
  handle.  It is required in all modes but SCRATCH mode, in which its
  use is forbidden.
  
+The @subcmd{ENCODING} subcommand specifies the encoding of text in the
+file.  For reading text files in CHARACTER mode, all of the forms
+described for @subcmd{ENCODING} on the @cmd{INSERT} command are
+supported (@pxref{INSERT}).  For reading in other file-based modes,
+encoding autodetection is not supported; if the specified encoding
+requests autodetection then the default encoding will be used.  This
+is also true when a file handle is used for writing a file in any mode.
+
  @node INPUT PROGRAM
  @section INPUT PROGRAM
  @vindex INPUT PROGRAM
  @node INPUT PROGRAM
  @section INPUT PROGRAM
  @vindex INPUT PROGRAM
@@ -942,9 +956,10 @@ active dataset.
  
  @display
  PRINT 
  
  @display
  PRINT 
-        OUTFILE='@var{file_name}'
-        RECORDS=@var{n_lines}
-        @{NOTABLE,TABLE@}
+        [OUTFILE='@var{file_name}']
+        [RECORDS=@var{n_lines}]
+        [@{NOTABLE,TABLE@}]
+        [ENCODING='@var{encoding}']
          [/[@var{line_no}] @var{arg}@dots{}]
  
  @var{arg} takes one of the following forms:
          [/[@var{line_no}] @var{arg}@dots{}]
  
  @var{arg} takes one of the following forms:
@@ -969,6 +984,11 @@ Handles}).  If @subcmd{OUTFILE} is not present then output will be sent to
  inserted at beginning of each output line, even lines that otherwise
  would be blank.
  
  inserted at beginning of each output line, even lines that otherwise
  would be blank.
  
+The @subcmd{ENCODING} subcommand may only be used if the
+@subcmd{OUTFILE} subcommand is also used.  It specifies the character
+encoding of the file.  @xref{INSERT}, for information on supported
+encodings.
+
  The @subcmd{RECORDS} subcommand specifies the number of lines to be output.  The
  number of lines may optionally be surrounded by parentheses.
  
  The @subcmd{RECORDS} subcommand specifies the number of lines to be output.  The
  number of lines may optionally be surrounded by parentheses.
  
@@ -983,7 +1003,6 @@ line number, the next line number will be specified.  Multiple lines may
  be specified using multiple slashes with the intended output for a line
  following its respective slash.
  
  be specified using multiple slashes with the intended output for a line
  following its respective slash.
  
-
  Literal strings may be printed.  Specify the string itself.  Optionally
  the string may be followed by a column number or range of column
  numbers, specifying the location on the line for the string to be
  Literal strings may be printed.  Specify the string itself.  Optionally
  the string may be followed by a column number or range of column
  numbers, specifying the location on the line for the string to be
@@ -1043,7 +1062,7 @@ written with a space inserted in the first column, as with @subcmd{PRINT}.
  @vindex PRINT SPACE
  
  @display
  @vindex PRINT SPACE
  
  @display
-PRINT SPACE OUTFILE='file_name' n_lines.
+PRINT SPACE [OUTFILE='@var{file_name}'] [ENCODING='@var{encoding}'] [@var{n_lines}].
  @end display
  
  @cmd{PRINT SPACE} prints one or more blank lines to an output file.
  @end display
  
  @cmd{PRINT SPACE} prints one or more blank lines to an output file.
@@ -1053,6 +1072,10 @@ a file specified by file name as a string or file handle (@pxref{File
  Handles}).  If OUTFILE is not specified then output will be directed to
  the listing file.
  
  Handles}).  If OUTFILE is not specified then output will be directed to
  the listing file.
  
+The @subcmd{ENCODING} subcommand may only be used if @subcmd{OUTFILE}
+is also used.  It specifies the character encoding of the file.
+@xref{INSERT}, for information on supported encodings.
+
  n_lines is also optional.  If present, it is an expression
  (@pxref{Expressions}) specifying the number of blank lines to be
  printed.  The expression must evaluate to a nonnegative value.
  n_lines is also optional.  If present, it is an expression
  (@pxref{Expressions}) specifying the number of blank lines to be
  printed.  The expression must evaluate to a nonnegative value.
@@ -1062,7 +1085,7 @@ printed.  The expression must evaluate to a nonnegative value.
  @vindex REREAD
  
  @display
  @vindex REREAD
  
  @display
-REREAD FILE=handle COLUMN=column.
+REREAD [FILE=@var{handle}] [COLUMN=@var{column}] [ENCODING='@var{encoding}'].
  @end display
  
  The @cmd{REREAD} transformation allows the previous input line in a
  @end display
  
  The @cmd{REREAD} transformation allows the previous input line in a
@@ -1082,6 +1105,10 @@ re-reading.  Specify an expression (@pxref{Expressions}) evaluating to
  the first column that should be included in the re-read line.  Columns
  are numbered from 1 at the left margin.
  
  the first column that should be included in the re-read line.  Columns
  are numbered from 1 at the left margin.
  
+The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE}
+subcommand is also used.  It specifies the character encoding of the
+file.  @xref{INSERT}, for information on supported encodings.
+
  Issuing @code{REREAD} multiple times will not back up in the data
  file.  Instead, it will re-read the same line multiple times.
  
  Issuing @code{REREAD} multiple times will not back up in the data
  file.  Instead, it will re-read the same line multiple times.
  
diff --git a/doc/files.texi b/doc/files.texi

index 04bbff40fc00d01dc92b778e5b70cb6df7a56b17..511cf5d43ad06de7828b5f8b871e6410aab3cd63 100644 (file)
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -366,6 +366,7 @@ GET DATA /TYPE=PSQL
  @display
  GET DATA /TYPE=TXT
          /FILE=@{'@var{file_name}',@var{file_handle}@}
  @display
  GET DATA /TYPE=TXT
          /FILE=@{'@var{file_name}',@var{file_handle}@}
+        [ENCODING='@var{encoding}']
          [/ARRANGEMENT=@{DELIMITED,FIXED@}]
          [/FIRSTCASE=@{@var{first_case}@}]
          [/IMPORTCASE=@{ALL,FIRST @var{max_cases},PERCENT @var{percent}@}]
          [/ARRANGEMENT=@{DELIMITED,FIXED@}]
          [/FIRSTCASE=@{@var{first_case}@}]
          [/IMPORTCASE=@{ALL,FIRST @var{max_cases},PERCENT @var{percent}@}]
@@ -381,6 +382,10 @@ The @subcmd{FILE} subcommand is mandatory.  Specify the file to be read as
  a string file name or (for textual data only) a
  file handle (@pxref{File Handles}).
  
  a string file name or (for textual data only) a
  file handle (@pxref{File Handles}).
  
+The @subcmd{ENCODING} subcommand specifies the character encoding of
+the file to be read.  @xref{INSERT}, for information on supported
+encodings.
+
  The @subcmd{ARRANGEMENT} subcommand determines the file's basic format.
  DELIMITED, the default setting, specifies that fields in the input
  data are separated by spaces, tabs, or other user-specified
  The @subcmd{ARRANGEMENT} subcommand determines the file's basic format.
  DELIMITED, the default setting, specifies that fields in the input
  data are separated by spaces, tabs, or other user-specified
diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c

index 6ca6977c87733d792fce4c7f31d48f8afab97e80..9a46bfad43c851b4d87759c8d9dca1ab3ee4fbaf 100644 (file)
--- a/src/data/file-handle-def.c
+++ b/src/data/file-handle-def.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -26,12 +26,13 @@
  #include "data/dataset.h"
  #include "data/file-name.h"
  #include "data/variable.h"
  #include "data/dataset.h"
  #include "data/file-name.h"
  #include "data/variable.h"
+#include "libpspp/cast.h"
  #include "libpspp/compiler.h"
  #include "libpspp/compiler.h"
+#include "libpspp/hash-functions.h"
  #include "libpspp/hmap.h"
  #include "libpspp/i18n.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  #include "libpspp/hmap.h"
  #include "libpspp/i18n.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
-#include "libpspp/hash-functions.h"
  
  #include "gl/xalloc.h"
  
  
  #include "gl/xalloc.h"
  
@@ -50,11 +51,11 @@ struct file_handle
      /* FH_REF_FILE only. */
      char *file_name;           /* File name as provided by user. */
      enum fh_mode mode;         /* File mode. */
      /* FH_REF_FILE only. */
      char *file_name;           /* File name as provided by user. */
      enum fh_mode mode;         /* File mode. */
-    const char *encoding;       /* File encoding. */
  
      /* FH_REF_FILE and FH_REF_INLINE only. */
      size_t record_width;        /* Length of fixed-format records. */
      size_t tab_width;           /* Tab width, 0=do not expand tabs. */
  
      /* FH_REF_FILE and FH_REF_INLINE only. */
      size_t record_width;        /* Length of fixed-format records. */
      size_t tab_width;           /* Tab width, 0=do not expand tabs. */
+    char *encoding;             /* Charset for contents. */
  
      /* FH_REF_DATASET only. */
      struct dataset *ds;         /* Dataset. */
  
      /* FH_REF_DATASET only. */
      struct dataset *ds;         /* Dataset. */
@@ -71,7 +72,8 @@ static struct file_handle *default_handle;
  static struct file_handle *inline_file;
  
  static struct file_handle *create_handle (const char *id,
  static struct file_handle *inline_file;
  
  static struct file_handle *create_handle (const char *id,
-                                          char *name, enum fh_referent);
+                                          char *name, enum fh_referent,
+                                          const char *encoding);
  static void free_handle (struct file_handle *);
  static void unname_handle (struct file_handle *);
  
  static void free_handle (struct file_handle *);
  static void unname_handle (struct file_handle *);
  
@@ -82,7 +84,8 @@ static struct hmap locks = HMAP_INITIALIZER (locks);
  void
  fh_init (void)
  {
  void
  fh_init (void)
  {
-  inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE);
+  inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE,
+                               "Auto");
    inline_file->record_width = 80;
    inline_file->tab_width = 8;
  }
    inline_file->record_width = 80;
    inline_file->tab_width = 8;
  }
@@ -110,6 +113,7 @@ free_handle (struct file_handle *handle)
    free (handle->id);
    free (handle->name);
    free (handle->file_name);
    free (handle->id);
    free (handle->name);
    free (handle->file_name);
+  free (handle->encoding);
    free (handle);
  }
  
    free (handle);
  }
  
@@ -189,7 +193,8 @@ fh_from_id (const char *id)
     The new handle is not fully initialized.  The caller is
     responsible for completing its initialization. */
  static struct file_handle *
     The new handle is not fully initialized.  The caller is
     responsible for completing its initialization. */
  static struct file_handle *
-create_handle (const char *id, char *handle_name, enum fh_referent referent)
+create_handle (const char *id, char *handle_name, enum fh_referent referent,
+               const char *encoding)
  {
    struct file_handle *handle = xzalloc (sizeof *handle);
  
  {
    struct file_handle *handle = xzalloc (sizeof *handle);
  
@@ -197,6 +202,7 @@ create_handle (const char *id, char *handle_name, enum fh_referent referent)
    handle->id = id != NULL ? xstrdup (id) : NULL;
    handle->name = handle_name;
    handle->referent = referent;
    handle->id = id != NULL ? xstrdup (id) : NULL;
    handle->name = handle_name;
    handle->referent = referent;
+  handle->encoding = xstrdup (encoding);
  
    if (id != NULL)
      {
  
    if (id != NULL)
      {
@@ -231,12 +237,11 @@ fh_create_file (const char *id, const char *file_name,
    struct file_handle *handle;
  
    handle_name = id != NULL ? xstrdup (id) : xasprintf ("`%s'", file_name);
    struct file_handle *handle;
  
    handle_name = id != NULL ? xstrdup (id) : xasprintf ("`%s'", file_name);
-  handle = create_handle (id, handle_name, FH_REF_FILE);
+  handle = create_handle (id, handle_name, FH_REF_FILE, properties->encoding);
    handle->file_name = xstrdup (file_name);
    handle->mode = properties->mode;
    handle->record_width = properties->record_width;
    handle->tab_width = properties->tab_width;
    handle->file_name = xstrdup (file_name);
    handle->mode = properties->mode;
    handle->record_width = properties->record_width;
    handle->tab_width = properties->tab_width;
-  handle->encoding = properties->encoding;
    return handle;
  }
  
    return handle;
  }
  
@@ -253,7 +258,7 @@ fh_create_dataset (struct dataset *ds)
    if (name[0] == '\0')
      name = _("active dataset");
  
    if (name[0] == '\0')
      name = _("active dataset");
  
-  handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET);
+  handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET, C_ENCODING);
    handle->ds = ds;
    return handle;
  }
    handle->ds = ds;
    return handle;
  }
@@ -263,7 +268,7 @@ const struct fh_properties *
  fh_default_properties (void)
  {
    static const struct fh_properties default_properties
  fh_default_properties (void)
  {
    static const struct fh_properties default_properties
-    = {FH_MODE_TEXT, 1024, 4, C_ENCODING};
+    = {FH_MODE_TEXT, 1024, 4, (char *) "Auto"};
    return &default_properties;
  }
  
    return &default_properties;
  }
  
@@ -333,10 +338,9 @@ fh_get_tab_width (const struct file_handle *handle)
  
  /* Returns the encoding of characters read from HANDLE. */
  const char *
  
  /* Returns the encoding of characters read from HANDLE. */
  const char *
-fh_get_legacy_encoding (const struct file_handle *handle)
+fh_get_encoding (const struct file_handle *handle)
  {
  {
-  assert (handle->referent & (FH_REF_FILE | FH_REF_INLINE));
-  return (handle->referent == FH_REF_FILE ? handle->encoding : C_ENCODING);
+  return handle->encoding;
  }
  
  /* Returns the dataset handle associated with HANDLE.
  }
  
  /* Returns the dataset handle associated with HANDLE.
diff --git a/src/data/file-handle-def.h b/src/data/file-handle-def.h

index 11898ef578eca45d437c86c41dfe71bd09d29ffc..9a60e7242383beb7c6fff28b3ec4b03dcb25365c 100644 (file)
--- a/src/data/file-handle-def.h
+++ b/src/data/file-handle-def.h
@@ -55,7 +55,7 @@ struct fh_properties
      enum fh_mode mode;          /* File mode. */
      size_t record_width;        /* Length of fixed-format records. */
      size_t tab_width;           /* Tab width, 0=do not expand tabs. */
      enum fh_mode mode;          /* File mode. */
      size_t record_width;        /* Length of fixed-format records. */
      size_t tab_width;           /* Tab width, 0=do not expand tabs. */
-    const char *encoding;       /* ASCII or EBCDIC? */
+    char *encoding;             /* Charset for contents. */
    };
  
  void fh_init (void);
    };
  
  void fh_init (void);
@@ -82,6 +82,7 @@ struct file_handle *fh_inline_file (void);
  const char *fh_get_id (const struct file_handle *);
  const char *fh_get_name (const struct file_handle *);
  enum fh_referent fh_get_referent (const struct file_handle *);
  const char *fh_get_id (const struct file_handle *);
  const char *fh_get_name (const struct file_handle *);
  enum fh_referent fh_get_referent (const struct file_handle *);
+const char *fh_get_encoding (const struct file_handle *);
  
  /* Properties of FH_REF_FILE file handles. */
  const char *fh_get_file_name (const struct file_handle *);
  
  /* Properties of FH_REF_FILE file handles. */
  const char *fh_get_file_name (const struct file_handle *);
@@ -90,7 +91,6 @@ enum fh_mode fh_get_mode (const struct file_handle *) ;
  /* Properties of FH_REF_FILE and FH_REF_INLINE file handles. */
  size_t fh_get_record_width (const struct file_handle *);
  size_t fh_get_tab_width (const struct file_handle *);
  /* Properties of FH_REF_FILE and FH_REF_INLINE file handles. */
  size_t fh_get_record_width (const struct file_handle *);
  size_t fh_get_tab_width (const struct file_handle *);
-const char *fh_get_legacy_encoding (const struct file_handle *);
  
  /* Properties of FH_REF_DATASET file handles. */
  struct dataset *fh_get_dataset (const struct file_handle *);
  
  /* Properties of FH_REF_DATASET file handles. */
  struct dataset *fh_get_dataset (const struct file_handle *);
diff --git a/src/language/data-io/data-list.c b/src/language/data-io/data-list.c

index f16c60651cf9928f46baaa5730fa4df19a3a0f01..17c6032d252b97181864f43e4158900b071316fe 100644 (file)
--- a/src/language/data-io/data-list.c
+++ b/src/language/data-io/data-list.c
@@ -78,7 +78,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
    struct dfm_reader *reader;
    struct variable *end = NULL;
    struct file_handle *fh = NULL;
    struct dfm_reader *reader;
    struct variable *end = NULL;
    struct file_handle *fh = NULL;
-  struct string encoding = DS_EMPTY_INITIALIZER;
+  char *encoding = NULL;
  
    int table;
    enum data_parser_type type;
  
    int table;
    enum data_parser_type type;
@@ -111,7 +111,8 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
           if (!lex_force_string (lexer))
             goto error;
  
           if (!lex_force_string (lexer))
             goto error;
  
-         ds_init_substring (&encoding, lex_tokss (lexer));
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
  
           lex_get (lexer);
         }
  
           lex_get (lexer);
         }
@@ -241,7 +242,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
      }
    type = data_parser_get_type (parser);
  
      }
    type = data_parser_get_type (parser);
  
-  if (! ds_is_empty (&encoding) && NULL == fh)
+  if (encoding && NULL == fh)
      msg (MW, _("Encoding should not be specified for inline data. It will be "
                 "ignored."));
  
      msg (MW, _("Encoding should not be specified for inline data. It will be "
                 "ignored."));
  
@@ -278,7 +279,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
    if (table)
      data_parser_output_description (parser, fh);
  
    if (table)
      data_parser_output_description (parser, fh);
  
-  reader = dfm_open_reader (fh, lexer);
+  reader = dfm_open_reader (fh, lexer, encoding);
    if (reader == NULL)
      goto error;
  
    if (reader == NULL)
      goto error;
  
@@ -294,7 +295,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
      data_parser_make_active_file (parser, ds, reader, dict);
  
    fh_unref (fh);
      data_parser_make_active_file (parser, ds, reader, dict);
  
    fh_unref (fh);
-  ds_destroy (&encoding);
+  free (encoding);
  
    return CMD_SUCCESS;
  
  
    return CMD_SUCCESS;
  
@@ -303,7 +304,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
    if (!in_input_program ())
      dict_destroy (dict);
    fh_unref (fh);
    if (!in_input_program ())
      dict_destroy (dict);
    fh_unref (fh);
-  ds_destroy (&encoding);
+  free (encoding);
    return CMD_CASCADING_FAILURE;
  }
  \f
    return CMD_CASCADING_FAILURE;
  }
  \f
diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c

index aea3bbd0374693ca6bd1846aaa96ce9cfb041ca3..1dc7c93f7778f9cc266f701a46634be448655e74 100644 (file)
--- a/src/language/data-io/data-parser.c
+++ b/src/language/data-io/data-parser.c
@@ -527,7 +527,7 @@ static bool
  parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
               struct ccase *c)
  {
  parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
               struct ccase *c)
  {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct field *f;
    int row;
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct field *f;
    int row;
@@ -579,7 +579,7 @@ static bool
  parse_delimited_span (const struct data_parser *parser,
                        struct dfm_reader *reader, struct ccase *c)
  {
  parse_delimited_span (const struct data_parser *parser,
                        struct dfm_reader *reader, struct ccase *c)
  {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct string tmp = DS_EMPTY_INITIALIZER;
    struct field *f;
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct string tmp = DS_EMPTY_INITIALIZER;
    struct field *f;
@@ -623,7 +623,7 @@ static bool
  parse_delimited_no_span (const struct data_parser *parser,
                           struct dfm_reader *reader, struct ccase *c)
  {
  parse_delimited_no_span (const struct data_parser *parser,
                           struct dfm_reader *reader, struct ccase *c)
  {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct string tmp = DS_EMPTY_INITIALIZER;
    struct substring s;
    const char *output_encoding = dict_get_encoding (parser->dict);
    struct string tmp = DS_EMPTY_INITIALIZER;
    struct substring s;
diff --git a/src/language/data-io/data-reader.c b/src/language/data-io/data-reader.c

index 0f96e589cc3bd7b1547110088efee015dce3d2de..ea95bc983298d54acc24df5c55b62f1495436c07 100644 (file)
--- a/src/language/data-io/data-reader.c
+++ b/src/language/data-io/data-reader.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -34,7 +34,9 @@
  #include "language/lexer/lexer.h"
  #include "libpspp/assertion.h"
  #include "libpspp/cast.h"
  #include "language/lexer/lexer.h"
  #include "libpspp/assertion.h"
  #include "libpspp/cast.h"
+#include "libpspp/encoding-guesser.h"
  #include "libpspp/integer-format.h"
  #include "libpspp/integer-format.h"
+#include "libpspp/line-reader.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  
@@ -69,6 +71,10 @@ struct dfm_reader
      size_t pos;                 /* Offset in line of current character. */
      unsigned eof_cnt;           /* # of attempts to advance past EOF. */
      struct lexer *lexer;        /* The lexer reading the file */
      size_t pos;                 /* Offset in line of current character. */
      unsigned eof_cnt;           /* # of attempts to advance past EOF. */
      struct lexer *lexer;        /* The lexer reading the file */
+    char *encoding;             /* Current encoding. */
+
+    /* For FH_MODE_TEXT only. */
+    struct line_reader *line_reader;
  
      /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
      size_t block_left;          /* Bytes left in current block. */
  
      /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
      size_t block_left;          /* Bytes left in current block. */
@@ -101,19 +107,28 @@ dfm_close_reader (struct dfm_reader *r)
          }
      }
  
          }
      }
  
+  line_reader_free (r->line_reader);
+  free (r->encoding);
    fh_unref (r->fh);
    ds_destroy (&r->line);
    ds_destroy (&r->scratch);
    free (r);
  }
  
    fh_unref (r->fh);
    ds_destroy (&r->line);
    ds_destroy (&r->scratch);
    free (r);
  }
  
-/* Opens the file designated by file handle FH for reading as a
-   data file.  Providing fh_inline_file() for FH designates the
-   "inline file", that is, data included inline in the command
-   file between BEGIN FILE and END FILE.  Returns a reader if
-   successful, or a null pointer otherwise. */
+/* Opens the file designated by file handle FH for reading as a data file.
+   Returns a reader if successful, or a null pointer otherwise.
+
+   If FH is fh_inline_file() then the new reader reads data included inline in
+   the command file between BEGIN DATA and END DATA, obtaining data from LEXER.
+   LEXER must remain valid as long as the new reader is in use.  ENCODING is
+   ignored.
+
+   If FH is not fh_inline_file(), then the encoding of the file read is by
+   default that of FH itself.  If ENCODING is nonnull, then it overrides the
+   default encoding.  LEXER is ignored. */
  struct dfm_reader *
  struct dfm_reader *
-dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
+dfm_open_reader (struct file_handle *fh, struct lexer *lexer,
+                 const char *encoding)
  {
    struct dfm_reader *r;
    struct fh_lock *lock;
  {
    struct dfm_reader *r;
    struct fh_lock *lock;
@@ -147,10 +162,6 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
          {
            msg (ME, _("Could not open `%s' for reading as a data file: %s."),
                 fh_get_file_name (r->fh), strerror (errno));
          {
            msg (ME, _("Could not open `%s' for reading as a data file: %s."),
                 fh_get_file_name (r->fh), strerror (errno));
-          fh_unlock (r->lock);
-          fh_unref (fh);
-          free (r);
-          return NULL;
          }
        r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
      }
          }
        r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
      }
@@ -158,14 +169,43 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
      r->file_size = -1;
    fh_lock_set_aux (lock, r);
  
      r->file_size = -1;
    fh_lock_set_aux (lock, r);
  
+  if (encoding == NULL)
+    encoding = fh_get_encoding (fh);
+  if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT)
+    {
+      r->line_reader = line_reader_for_fd (encoding, fileno (r->file));
+      if (r->line_reader == NULL)
+        {
+          msg (ME, _("Could not read `%s' as a text file with encoding `%s': "
+                     "%s."),
+               fh_get_file_name (r->fh), encoding, strerror (errno));
+          goto error;
+        }
+      r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
+    }
+  else
+    {
+      r->line_reader = NULL;
+      r->encoding = xstrdup (encoding_guess_parse_encoding (encoding));
+    }
+
    return r;
    return r;
+
+error:
+  fh_unlock (r->lock);
+  fh_unref (fh);
+  free (r);
+  return NULL;
  }
  
  /* Returns true if an I/O error occurred on READER, false otherwise. */
  bool
  dfm_reader_error (const struct dfm_reader *r)
  {
  }
  
  /* Returns true if an I/O error occurred on READER, false otherwise. */
  bool
  dfm_reader_error (const struct dfm_reader *r)
  {
-  return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file);
+  return (fh_get_referent (r->fh) == FH_REF_FILE
+          && (r->line_reader != NULL
+              ? line_reader_error (r->line_reader) != 0
+              : ferror (r->file)));
  }
  
  /* Reads a record from the inline file into R.
  }
  
  /* Reads a record from the inline file into R.
@@ -211,17 +251,12 @@ read_inline_record (struct dfm_reader *r)
    return true;
  }
  
    return true;
  }
  
-/* Report a read error or unexpected end-of-file condition on R. */
+/* Report a read error on R. */
  static void
  read_error (struct dfm_reader *r)
  {
  static void
  read_error (struct dfm_reader *r)
  {
-  if (ferror (r->file))
-    msg (ME, _("Error reading file %s: %s."),
-         fh_get_name (r->fh), strerror (errno));
-  else if (feof (r->file))
-    msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
-  else
-    NOT_REACHED ();
+  msg (ME, _("Error reading file %s: %s."),
+       fh_get_name (r->fh), strerror (errno));
  }
  
  /* Report a partial read at end of file reading R. */
  }
  
  /* Report a partial read at end of file reading R. */
@@ -333,6 +368,34 @@ read_size (struct dfm_reader *r, size_t *size_out)
    return 1;
  }
  
    return 1;
  }
  
+static bool
+read_text_record (struct dfm_reader *r)
+{
+  bool is_auto;
+  bool ok;
+
+  /* Read a line.  If the line reader's encoding changes, update r->encoding to
+     match. */
+  is_auto = line_reader_is_auto (r->line_reader);
+  ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX);
+  if (is_auto && !line_reader_is_auto (r->line_reader))
+    {
+      free (r->encoding);
+      r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
+    }
+
+  /* Detect and report read error. */
+  if (!ok)
+    {
+      int error = line_reader_error (r->line_reader);
+      if (error != 0)
+        msg (ME, _("Error reading file %s: %s."),
+             fh_get_name (r->fh), strerror (error));
+    }
+
+  return ok;
+}
+
  /* Reads a record from a disk file into R.
     Returns true if successful, false on error or at end of file. */
  static bool
  /* Reads a record from a disk file into R.
     Returns true if successful, false on error or at end of file. */
  static bool
@@ -344,17 +407,7 @@ read_file_record (struct dfm_reader *r)
    switch (fh_get_mode (r->fh))
      {
      case FH_MODE_TEXT:
    switch (fh_get_mode (r->fh))
      {
      case FH_MODE_TEXT:
-      if (ds_read_line (&r->line, r->file, SIZE_MAX))
-        {
-          ds_chomp_byte (&r->line, '\n');
-          return true;
-        }
-      else
-        {
-          if (ferror (r->file))
-            read_error (r);
-          return false;
-        }
+      return read_text_record (r);
  
      case FH_MODE_FIXED:
        if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
  
      case FH_MODE_FIXED:
        if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
@@ -597,11 +650,11 @@ dfm_expand_tabs (struct dfm_reader *r)
    r->pos = new_pos;
  }
  
    r->pos = new_pos;
  }
  
-/* Returns the legacy character encoding of data read from READER. */
+/* Returns the character encoding of data read from READER. */
  const char *
  const char *
-dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
+dfm_reader_get_encoding (const struct dfm_reader *reader)
  {
  {
-  return fh_get_legacy_encoding (reader->fh);
+  return reader->encoding;
  }
  
  /* Returns a number between 0 and 100 that approximates the
  }
  
  /* Returns a number between 0 and 100 that approximates the
@@ -615,7 +668,11 @@ dfm_get_percent_read (const struct dfm_reader *reader)
  {
    if (reader->file_size >= 0)
      {
  {
    if (reader->file_size >= 0)
      {
-      off_t position = ftello (reader->file);
+      off_t position;
+
+      position = (reader->line_reader != NULL
+                  ? line_reader_tell (reader->line_reader)
+                  : ftello (reader->file));
        if (position >= 0)
          {
            double p = 100.0 * position / reader->file_size;
        if (position >= 0)
          {
            double p = 100.0 * position / reader->file_size;
@@ -710,7 +767,7 @@ cmd_begin_data (struct lexer *lexer, struct dataset *ds)
    lex_match (lexer, T_ENDCMD);
  
    /* Open inline file. */
    lex_match (lexer, T_ENDCMD);
  
    /* Open inline file. */
-  r = dfm_open_reader (fh_inline_file (), lexer);
+  r = dfm_open_reader (fh_inline_file (), lexer, NULL);
    r->flags |= DFM_SAW_BEGIN_DATA;
    r->flags &= ~DFM_CONSUME;
  
    r->flags |= DFM_SAW_BEGIN_DATA;
    r->flags &= ~DFM_CONSUME;
  
diff --git a/src/language/data-io/data-reader.h b/src/language/data-io/data-reader.h

index affff788153401961dde16d0247a793f288fff8a..a199f015af79a5749dddfe6d99aa5ece5a537baa 100644 (file)
--- a/src/language/data-io/data-reader.h
+++ b/src/language/data-io/data-reader.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -31,13 +31,14 @@ struct string;
  struct lexer;
  
  /* Input. */
  struct lexer;
  
  /* Input. */
-struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *);
+struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *,
+                                    const char *encoding);
  void dfm_close_reader (struct dfm_reader *);
  bool dfm_reader_error (const struct dfm_reader *);
  unsigned dfm_eof (struct dfm_reader *);
  struct substring dfm_get_record (struct dfm_reader *);
  void dfm_expand_tabs (struct dfm_reader *);
  void dfm_close_reader (struct dfm_reader *);
  bool dfm_reader_error (const struct dfm_reader *);
  unsigned dfm_eof (struct dfm_reader *);
  struct substring dfm_get_record (struct dfm_reader *);
  void dfm_expand_tabs (struct dfm_reader *);
-const char *dfm_reader_get_legacy_encoding (const struct dfm_reader *);
+const char *dfm_reader_get_encoding (const struct dfm_reader *);
  int dfm_get_percent_read (const struct dfm_reader *);
  
  /* Line control. */
  int dfm_get_percent_read (const struct dfm_reader *);
  
  /* Line control. */
diff --git a/src/language/data-io/data-writer.c b/src/language/data-io/data-writer.c

index 113be58805f979c63bd736a85f5c473e5595b006..5270db0e8119f86236ba5e64e68a510b8cf6840c 100644 (file)
--- a/src/language/data-io/data-writer.c
+++ b/src/language/data-io/data-writer.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -28,7 +28,9 @@
  #include "data/make-file.h"
  #include "language/data-io/file-handle.h"
  #include "libpspp/assertion.h"
  #include "data/make-file.h"
  #include "language/data-io/file-handle.h"
  #include "libpspp/assertion.h"
+#include "libpspp/encoding-guesser.h"
  #include "libpspp/integer-format.h"
  #include "libpspp/integer-format.h"
+#include "libpspp/i18n.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  
@@ -46,14 +48,31 @@ struct dfm_writer
      struct fh_lock *lock;       /* Exclusive access to file. */
      FILE *file;                 /* Associated file. */
      struct replace_file *rf;    /* Atomic file replacement support. */
      struct fh_lock *lock;       /* Exclusive access to file. */
      FILE *file;                 /* Associated file. */
      struct replace_file *rf;    /* Atomic file replacement support. */
+    char *encoding;             /* Encoding. */
+
+    int unit;                   /* Unit width, in bytes. */
+    char lf[MAX_UNIT];          /* \n in encoding, 'unit' bytes long. */
+    char spaces[32];            /* 32 bytes worth of ' ' in encoding. */
    };
  
    };
  
-/* Opens a file handle for writing as a data file. */
+/* Opens a file handle for writing as a data file.
+
+   The encoding of the file written is by default that of FH itself.  If
+   ENCODING is nonnull, then it overrides the default encoding.
+
+   *However*: ENCODING directly affects only text strings written by the data
+   writer code itself, that is, new-lines in FH_MODE_TEXT and space padding in
+   FH_MODE_FIXED mode.  The client must do its own encoding translation for the
+   data that it writes.  (This is unavoidable because sometimes the data
+   written includes binary data that reencoding would mangle.)  The client can
+   obtain the encoding to re-encode into with dfm_writer_get_encoding(). */
  struct dfm_writer *
  struct dfm_writer *
-dfm_open_writer (struct file_handle *fh)
+dfm_open_writer (struct file_handle *fh, const char *encoding)
  {
  {
+  struct encoding_info ei;
    struct dfm_writer *w;
    struct fh_lock *lock;
    struct dfm_writer *w;
    struct fh_lock *lock;
+  int ofs;
  
    lock = fh_lock (fh, FH_REF_FILE, N_("data file"), FH_ACC_WRITE, false);
    if (lock == NULL)
  
    lock = fh_lock (fh, FH_REF_FILE, N_("data file"), FH_ACC_WRITE, false);
    if (lock == NULL)
@@ -63,11 +82,22 @@ dfm_open_writer (struct file_handle *fh)
    if (w != NULL)
      return w;
  
    if (w != NULL)
      return w;
  
+  encoding = encoding_guess_parse_encoding (encoding != NULL
+                                            ? encoding
+                                            : fh_get_encoding (fh));
+  get_encoding_info (&ei, encoding);
+
    w = xmalloc (sizeof *w);
    w->fh = fh_ref (fh);
    w->lock = lock;
    w->rf = replace_file_start (fh_get_file_name (w->fh), "wb", 0666,
                                &w->file, NULL);
    w = xmalloc (sizeof *w);
    w->fh = fh_ref (fh);
    w->lock = lock;
    w->rf = replace_file_start (fh_get_file_name (w->fh), "wb", 0666,
                                &w->file, NULL);
+  w->encoding = xstrdup (encoding);
+  w->unit = ei.unit;
+  memcpy (w->lf, ei.lf, sizeof w->lf);
+  for (ofs = 0; ofs + ei.unit <= sizeof w->spaces; ofs += ei.unit)
+    memcpy (&w->spaces[ofs], ei.space, ei.unit);
+
    if (w->rf == NULL)
      {
        msg (ME, _("An error occurred while opening `%s' for writing "
    if (w->rf == NULL)
      {
        msg (ME, _("An error occurred while opening `%s' for writing "
@@ -104,7 +134,7 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len)
      {
      case FH_MODE_TEXT:
        fwrite (rec, len, 1, w->file);
      {
      case FH_MODE_TEXT:
        fwrite (rec, len, 1, w->file);
-      putc ('\n', w->file);
+      fwrite (w->lf, w->unit, 1, w->file);
        break;
  
      case FH_MODE_FIXED:
        break;
  
      case FH_MODE_FIXED:
@@ -115,9 +145,8 @@ dfm_put_record (struct dfm_writer *w, const char *rec, size_t len)
          fwrite (rec, write_bytes, 1, w->file);
          while (pad_bytes > 0)
            {
          fwrite (rec, write_bytes, 1, w->file);
          while (pad_bytes > 0)
            {
-            static const char spaces[32] = "                                ";
-            size_t chunk = MIN (pad_bytes, sizeof spaces);
-            fwrite (spaces, chunk, 1, w->file);
+            size_t chunk = MIN (pad_bytes, sizeof w->spaces);
+            fwrite (w->spaces, chunk, 1, w->file);
              pad_bytes -= chunk;
            }
        }
              pad_bytes -= chunk;
            }
        }
@@ -193,14 +222,15 @@ dfm_close_writer (struct dfm_writer *w)
          ok = false;
      }
    fh_unref (w->fh);
          ok = false;
      }
    fh_unref (w->fh);
+  free (w->encoding);
    free (w);
  
    return ok;
  }
  
    free (w);
  
    return ok;
  }
  
-/* Returns the legacy character encoding of data written to WRITER. */
+/* Returns the encoding of data written to WRITER. */
  const char *
  const char *
-dfm_writer_get_legacy_encoding (const struct dfm_writer *writer)
+dfm_writer_get_encoding (const struct dfm_writer *writer)
  {
  {
-  return fh_get_legacy_encoding (writer->fh);
+  return writer->encoding;
  }
  }
diff --git a/src/language/data-io/data-writer.h b/src/language/data-io/data-writer.h

index 045db3163fd3144f2653e294d5a023e1416dc0f6..10ad6cd656c6acfc2036c66f11be2fbf0304e886 100644 (file)
--- a/src/language/data-io/data-writer.h
+++ b/src/language/data-io/data-writer.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -23,10 +23,11 @@
  #include <stddef.h>
  
  struct file_handle;
  #include <stddef.h>
  
  struct file_handle;
-struct dfm_writer *dfm_open_writer (struct file_handle *);
+struct dfm_writer *dfm_open_writer (struct file_handle *,
+                                    const char *encoding);
  bool dfm_close_writer (struct dfm_writer *);
  bool dfm_write_error (const struct dfm_writer *);
  bool dfm_put_record (struct dfm_writer *, const char *rec, size_t len);
  bool dfm_close_writer (struct dfm_writer *);
  bool dfm_write_error (const struct dfm_writer *);
  bool dfm_put_record (struct dfm_writer *, const char *rec, size_t len);
-const char *dfm_writer_get_legacy_encoding (const struct dfm_writer *);
+const char *dfm_writer_get_encoding (const struct dfm_writer *);
  
  #endif /* data-writer.h */
  
  #endif /* data-writer.h */
diff --git a/src/language/data-io/file-handle.q b/src/language/data-io/file-handle.q

index 0519803e7fc08080217ae4593382a4d045813780..26dfc97e0c920d1fb9d9d8f90ec9a5115f65a7c1 100644 (file)
--- a/src/language/data-io/file-handle.q
+++ b/src/language/data-io/file-handle.q
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,20 +16,22 @@
  
  #include <config.h>
  
  
  #include <config.h>
  
+#include "data/file-handle-def.h"
+
  #include <limits.h>
  #include <errno.h>
  #include <stdlib.h>
  
  #include "data/file-name.h"
  #include "data/session.h"
  #include <limits.h>
  #include <errno.h>
  #include <stdlib.h>
  
  #include "data/file-name.h"
  #include "data/session.h"
+#include "data/variable.h"
  #include "language/command.h"
  #include "language/data-io/file-handle.h"
  #include "language/lexer/lexer.h"
  #include "libpspp/assertion.h"
  #include "language/command.h"
  #include "language/data-io/file-handle.h"
  #include "language/lexer/lexer.h"
  #include "libpspp/assertion.h"
+#include "libpspp/cast.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
  #include "libpspp/message.h"
  #include "libpspp/str.h"
-#include "data/variable.h"
-#include "data/file-handle-def.h"
  
  #include "gl/xalloc.h"
  
  
  #include "gl/xalloc.h"
  
@@ -45,7 +47,8 @@
       lrecl=integer;
       tabwidth=integer;
       mode=mode:!character/binary/image/360;
       lrecl=integer;
       tabwidth=integer;
       mode=mode:!character/binary/image/360;
-     recform=recform:fixed/f/variable/v/spanned/vs.
+     recform=recform:fixed/f/variable/v/spanned/vs;
+     encoding=string.
  */
  /* (declarations) */
  /* (functions) */
  */
  /* (declarations) */
  /* (functions) */
@@ -109,7 +112,7 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds)
        properties.mode = FH_MODE_VARIABLE;
        break;
      case FH_360:
        properties.mode = FH_MODE_VARIABLE;
        break;
      case FH_360:
-      properties.encoding = "EBCDIC-US";
+      properties.encoding = CONST_CAST (char *, "EBCDIC-US");
        if (cmd.recform == FH_FIXED || cmd.recform == FH_F)
          properties.mode = FH_MODE_FIXED;
        else if (cmd.recform == FH_VARIABLE || cmd.recform == FH_V)
        if (cmd.recform == FH_FIXED || cmd.recform == FH_F)
          properties.mode = FH_MODE_FIXED;
        else if (cmd.recform == FH_VARIABLE || cmd.recform == FH_V)
@@ -146,6 +149,9 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds)
          properties.record_width = cmd.n_lrecl[0];
      }
  
          properties.record_width = cmd.n_lrecl[0];
      }
  
+  if (cmd.s_encoding != NULL)
+    properties.encoding = cmd.s_encoding;
+
    fh_create_file (handle_name, cmd.s_name, &properties);
  
    result = CMD_SUCCESS;
    fh_create_file (handle_name, cmd.s_name, &properties);
  
    result = CMD_SUCCESS;
diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c

index 4274f959d26048de21d79797d3eed879da6766fd..10d59aa374aa06cd29b30c3ea3e58bf528187099 100644 (file)
--- a/src/language/data-io/get-data.c
+++ b/src/language/data-io/get-data.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -307,6 +307,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
    struct dictionary *dict = dict_create (get_default_encoding ());
    struct file_handle *fh = NULL;
    struct dfm_reader *reader = NULL;
    struct dictionary *dict = dict_create (get_default_encoding ());
    struct file_handle *fh = NULL;
    struct dfm_reader *reader = NULL;
+  char *encoding = NULL;
    char *name = NULL;
  
    int record;
    char *name = NULL;
  
    int record;
@@ -334,7 +335,18 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
        if (!lex_force_match (lexer, T_SLASH))
          goto error;
  
        if (!lex_force_match (lexer, T_SLASH))
          goto error;
  
-      if (lex_match_id (lexer, "ARRANGEMENT"))
+      if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
+      else if (lex_match_id (lexer, "ARRANGEMENT"))
          {
            bool ok;
  
          {
            bool ok;
  
@@ -606,12 +618,13 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
      }
    while (lex_token (lexer) != T_ENDCMD);
  
      }
    while (lex_token (lexer) != T_ENDCMD);
  
-  reader = dfm_open_reader (fh, lexer);
+  reader = dfm_open_reader (fh, lexer, encoding);
    if (reader == NULL)
      goto error;
  
    data_parser_make_active_file (parser, ds, reader, dict);
    fh_unref (fh);
    if (reader == NULL)
      goto error;
  
    data_parser_make_active_file (parser, ds, reader, dict);
    fh_unref (fh);
+  free (encoding);
    return CMD_SUCCESS;
  
   error:
    return CMD_SUCCESS;
  
   error:
@@ -619,6 +632,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
    dict_destroy (dict);
    fh_unref (fh);
    free (name);
    dict_destroy (dict);
    fh_unref (fh);
    free (name);
+  free (encoding);
    return CMD_CASCADING_FAILURE;
  }
  
    return CMD_CASCADING_FAILURE;
  }
  
diff --git a/src/language/data-io/inpt-pgm.c b/src/language/data-io/inpt-pgm.c

index 6f2a99e038cceca6bb2fc06c1b64ca30ec6f9e30..36c58c859122fc5a0c6f9322bd46ed102f1af135 100644 (file)
--- a/src/language/data-io/inpt-pgm.c
+++ b/src/language/data-io/inpt-pgm.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -266,6 +266,7 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
    struct file_handle *fh;       /* File to be re-read. */
    struct expression *e;         /* Expression for column to set. */
    struct reread_trns *t;        /* Created transformation. */
    struct file_handle *fh;       /* File to be re-read. */
    struct expression *e;         /* Expression for column to set. */
    struct reread_trns *t;        /* Created transformation. */
+  char *encoding = NULL;
  
    fh = fh_get_default_handle ();
    e = NULL;
  
    fh = fh_get_default_handle ();
    e = NULL;
@@ -278,13 +279,12 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
           if (e)
             {
                lex_sbc_only_once ("COLUMN");
           if (e)
             {
                lex_sbc_only_once ("COLUMN");
-             expr_free (e);
-             return CMD_CASCADING_FAILURE;
+              goto error;
             }
  
           e = expr_parse (lexer, ds, EXPR_NUMBER);
           if (!e)
             }
  
           e = expr_parse (lexer, ds, EXPR_NUMBER);
           if (!e)
-           return CMD_CASCADING_FAILURE;
+            goto error;
         }
        else if (lex_match_id (lexer, "FILE"))
         {
         }
        else if (lex_match_id (lexer, "FILE"))
         {
@@ -292,26 +292,39 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
            fh_unref (fh);
            fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
           if (fh == NULL)
            fh_unref (fh);
            fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
           if (fh == NULL)
-           {
-             expr_free (e);
-             return CMD_CASCADING_FAILURE;
-           }
+            goto error;
+       }
+      else if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
         }
        else
         {
           lex_error (lexer, NULL);
         }
        else
         {
           lex_error (lexer, NULL);
-         expr_free (e);
-          return CMD_CASCADING_FAILURE;
+          goto error;
         }
      }
  
    t = xmalloc (sizeof *t);
         }
      }
  
    t = xmalloc (sizeof *t);
-  t->reader = dfm_open_reader (fh, lexer);
+  t->reader = dfm_open_reader (fh, lexer, encoding);
    t->column = e;
    add_transformation (ds, reread_trns_proc, reread_trns_free, t);
  
    fh_unref (fh);
    t->column = e;
    add_transformation (ds, reread_trns_proc, reread_trns_free, t);
  
    fh_unref (fh);
+  free (encoding);
    return CMD_SUCCESS;
    return CMD_SUCCESS;
+
+error:
+  expr_free (e);
+  free (encoding);
+  return CMD_CASCADING_FAILURE;
  }
  
  /* Executes a REREAD transformation. */
  }
  
  /* Executes a REREAD transformation. */
diff --git a/src/language/data-io/print-space.c b/src/language/data-io/print-space.c

index edaf13e769e42c8a6e20c72a617e4c4f3243f802..adeb92ba5b656f7391dd75bc1736469d59c00dd7 100644 (file)
--- a/src/language/data-io/print-space.c
+++ b/src/language/data-io/print-space.c
@@ -51,6 +51,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
    struct file_handle *handle = NULL;
    struct expression *expr = NULL;
    struct dfm_writer *writer;
    struct file_handle *handle = NULL;
    struct expression *expr = NULL;
    struct dfm_writer *writer;
+  char *encoding = NULL;
  
    if (lex_match_id (lexer, "OUTFILE"))
      {
  
    if (lex_match_id (lexer, "OUTFILE"))
      {
@@ -59,6 +60,17 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
        handle = fh_parse (lexer, FH_REF_FILE, NULL);
        if (handle == NULL)
         return CMD_FAILURE;
        handle = fh_parse (lexer, FH_REF_FILE, NULL);
        if (handle == NULL)
         return CMD_FAILURE;
+
+      if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
      }
    else
      handle = NULL;
      }
    else
      handle = NULL;
@@ -77,7 +89,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
  
    if (handle != NULL)
      {
  
    if (handle != NULL)
      {
-      writer = dfm_open_writer (handle);
+      writer = dfm_open_writer (handle, encoding);
        if (writer == NULL)
          goto error;
      }
        if (writer == NULL)
          goto error;
      }
@@ -124,7 +136,7 @@ print_space_trns_proc (void *t_, struct ccase **c,
      if (trns->writer == NULL)
        text_item_submit (text_item_create (TEXT_ITEM_BLANK_LINE, ""));
      else
      if (trns->writer == NULL)
        text_item_submit (text_item_create (TEXT_ITEM_BLANK_LINE, ""));
      else
-      dfm_put_record (trns->writer, " ", 1);
+      dfm_put_record (trns->writer, " ", 1); /* XXX */
  
    if (trns->writer != NULL && dfm_write_error (trns->writer))
      return TRNS_ERROR;
  
    if (trns->writer != NULL && dfm_write_error (trns->writer))
      return TRNS_ERROR;
diff --git a/src/language/data-io/print.c b/src/language/data-io/print.c

index 86952e0efe2e4c3ce70f02519c0cd524fa60cc03..cffa3bd49fdd29f480798c311ba796ae791ff1da 100644 (file)
--- a/src/language/data-io/print.c
+++ b/src/language/data-io/print.c
@@ -136,6 +136,7 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds,
    bool print_table = 0;
    struct print_trns *trns;
    struct file_handle *fh = NULL;
    bool print_table = 0;
    struct print_trns *trns;
    struct file_handle *fh = NULL;
+  char *encoding = NULL;
    struct pool *tmp_pool;
  
    /* Fill in prt to facilitate error-handling. */
    struct pool *tmp_pool;
  
    /* Fill in prt to facilitate error-handling. */
@@ -160,6 +161,17 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds,
           if (fh == NULL)
             goto error;
         }
           if (fh == NULL)
             goto error;
         }
+      else if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
        else if (lex_match_id (lexer, "RECORDS"))
         {
           lex_match (lexer, T_EQUALS);
        else if (lex_match_id (lexer, "RECORDS"))
         {
           lex_match (lexer, T_EQUALS);
@@ -194,10 +206,10 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds,
  
    if (fh != NULL)
      {
  
    if (fh != NULL)
      {
-      trns->writer = dfm_open_writer (fh);
+      trns->writer = dfm_open_writer (fh, encoding);
        if (trns->writer == NULL)
          goto error;
        if (trns->writer == NULL)
          goto error;
-      trns->encoding = dfm_writer_get_legacy_encoding (trns->writer);
+      trns->encoding = dfm_writer_get_encoding (trns->writer);
      }
    else
      trns->encoding = UTF8;
      }
    else
      trns->encoding = UTF8;
diff --git a/tests/language/data-io/get-data-txt.at b/tests/language/data-io/get-data-txt.at

index 4418974701233658dd5531635391977994eba4b9..3ba508c7c44420a0d6b2e0e8a81e866b2e315f77 100644 (file)
--- a/tests/language/data-io/get-data-txt.at
+++ b/tests/language/data-io/get-data-txt.at
@@ -568,3 +568,20 @@ x
  100
  ])
  AT_CLEANUP
  100
  ])
  AT_CLEANUP
+
+AT_SETUP([GET DATA /TYPE=TXT with ENCODING subcommand])
+AT_CHECK([i18n-test supports_encodings UTF-8 ISO-8859-1])
+AT_DATA([get-data.sps], [dnl
+set locale='utf-8'
+get data /type=txt /file='data.txt' /encoding='iso-8859-1'
+  /delimiters="," /variables=s a8.
+list.
+])
+printf '\351' > data.txt       # é in ISO-8859-1.
+AT_CHECK([pspp -o pspp.csv get-data.sps])
+AT_CHECK([cat pspp.csv], [0], [dnl
+Table: Data List
+s
+é      @&t@
+])
+AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 20 Jun 2012 05:43:24 +0000 (22:43 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 20 Jun 2012 06:09:38 +0000 (23:09 -0700)
NEWS		patch \| blob \| history
doc/data-io.texi		patch \| blob \| history
doc/files.texi		patch \| blob \| history
src/data/file-handle-def.c		patch \| blob \| history
src/data/file-handle-def.h		patch \| blob \| history
src/language/data-io/data-list.c		patch \| blob \| history
src/language/data-io/data-parser.c		patch \| blob \| history
src/language/data-io/data-reader.c		patch \| blob \| history
src/language/data-io/data-reader.h		patch \| blob \| history
src/language/data-io/data-writer.c		patch \| blob \| history
src/language/data-io/data-writer.h		patch \| blob \| history
src/language/data-io/file-handle.q		patch \| blob \| history
src/language/data-io/get-data.c		patch \| blob \| history
src/language/data-io/inpt-pgm.c		patch \| blob \| history
src/language/data-io/print-space.c		patch \| blob \| history
src/language/data-io/print.c		patch \| blob \| history
tests/language/data-io/get-data-txt.at		patch \| blob \| history