From 02a682272552710b1c4c869db3fea1d83b0eb3a7 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 22 Oct 2013 22:11:58 -0700 Subject: [PATCH] pspp-convert: New utility to convert among data formats. --- NEWS | 5 + doc/automake.mk | 1 + doc/pspp-convert.texi | 73 ++++++++++++++ doc/pspp.texinfo | 5 +- utilities/automake.mk | 9 ++ utilities/pspp-convert.1 | 83 +++++++++++++++ utilities/pspp-convert.c | 211 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 386 insertions(+), 1 deletion(-) create mode 100644 doc/pspp-convert.texi create mode 100644 utilities/pspp-convert.1 create mode 100644 utilities/pspp-convert.c diff --git a/NEWS b/NEWS index e3da3e489a..7b047be447 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,11 @@ Changes since 0.8.1: way. To fix the problem, read the system file with this version of PSPP and then save a new copy of it.) + * pspp-convert, a new standalone utility for converting SPSS system + and portable files to other formats, is now included. The initial + version is only capable of converting to comma-separated value + files. + * Build changes: - zlib is now a required dependency. (Previously it was optional.) diff --git a/doc/automake.mk b/doc/automake.mk index 2a06686d65..d5fb00b623 100644 --- a/doc/automake.mk +++ b/doc/automake.mk @@ -18,6 +18,7 @@ doc_pspp_TEXINFOS = doc/version.texi \ doc/invoking.texi \ doc/language.texi \ doc/license.texi \ + doc/pspp-convert.texi \ doc/ni.texi \ doc/not-implemented.texi \ doc/statistics.texi \ diff --git a/doc/pspp-convert.texi b/doc/pspp-convert.texi new file mode 100644 index 0000000000..328e9caa02 --- /dev/null +++ b/doc/pspp-convert.texi @@ -0,0 +1,73 @@ +@node Invoking pspp-convert +@chapter Invoking @command{pspp-convert} +@cindex Invocation +@cindex @command{pspp-convert} + +@command{pspp-convert} is a command-line utility accompanying +@pspp{}. It reads an SPSS system or portable file @var{input} and +writes a copy of it to another @var{output} in a different format.
+Synopsis: + +@display +@t{pspp-convert} [@var{options}] @var{input} @var{output} + +@t{pspp-convert -@w{-}help} + +@t{pspp-convert -@w{-}version} +@end display + +The format of @var{input} is automatically detected, except that the +character encoding of old system files cannot always be guessed +correctly. Use @code{-e @var{encoding}} to specify the encoding in this +case. + +By default, the intended format for @var{output} is inferred based on its +extension: + +@table @code +@item csv +@itemx txt +Comma-separated value. Each value is formatted according to its +variable's print format. The first line in the file contains variable +names. + +@item sav +@itemx sys +SPSS system file. + +@item por +SPSS portable file. +@end table + +Use @code{-O @var{extension}} to override the inferred format or to +specify the format for unrecognized extensions. + +The following options are accepted: + +@table @code +@item -O @var{format} +@itemx --output-format=@var{format} +Specifies the desired output format. @var{format} must be one of the +extensions listed above, e.g. @code{-O csv} requests comma-separated +value output. + +@item -c @var{maxcases} +@itemx --cases=@var{maxcases} +By default, all cases are copied from @var{input} to @var{output}. +Specify this option to limit the number of cases written to +@var{output} to @var{maxcases}. + +@item -e @var{charset} +@itemx --encoding=@var{charset} +Overrides the encoding in which character strings in @var{input} are +interpreted. This option is necessary because old SPSS system files +do not self-identify their encoding. + +@item -h +@itemx --help +Prints a usage message on stdout and exits. + +@item -v +@itemx --version +Prints version information on stdout and exits. +@end table diff --git a/doc/pspp.texinfo b/doc/pspp.texinfo index b01f028191..eae4f540a6 100644 --- a/doc/pspp.texinfo +++ b/doc/pspp.texinfo @@ -54,7 +54,7 @@ This manual is for GNU PSPP version @value{VERSION}, software for statistical analysis.
-Copyright @copyright{} 1997, 1998, 2004, 2005, 2009, 2012 Free Software Foundation, Inc. +Copyright @copyright{} 1997, 1998, 2004, 2005, 2009, 2012, 2013 Free Software Foundation, Inc. @quotation Permission is granted to copy, distribute and/or modify this document @@ -112,6 +112,7 @@ in the production of this manual. * Statistics:: Basic statistical procedures. * Utilities:: Other commands. +* Invoking pspp-convert:: Utility for converting among file formats. * Not Implemented:: What's not here yet * Bugs:: Known problems; submitting bug reports. @@ -138,6 +139,8 @@ in the production of this manual. @include flow-control.texi @include statistics.texi @include utilities.texi + +@include pspp-convert.texi @include not-implemented.texi @include bugs.texi diff --git a/utilities/automake.mk b/utilities/automake.mk index 8c77276dcd..5a44ff825c 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -5,3 +5,12 @@ utilities_pspp_dump_sav_SOURCES = \ src/libpspp/float-format.c \ utilities/pspp-dump-sav.c utilities_pspp_dump_sav_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\" + +bin_PROGRAMS += utilities/pspp-convert +dist_man_MANS += utilities/pspp-convert.1 +utilities_pspp_convert_SOURCES = utilities/pspp-convert.c +utilities_pspp_convert_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\" +utilities_pspp_convert_LDADD = \ + src/libpspp/liblibpspp.la \ + src/libpspp-core.la \ + gl/libgl.la diff --git a/utilities/pspp-convert.1 b/utilities/pspp-convert.1 new file mode 100644 index 0000000000..f1580cd6b5 --- /dev/null +++ b/utilities/pspp-convert.1 @@ -0,0 +1,83 @@ +.\" -*- nroff -*- +.de IQ +. br +. ns +. IP "\\$1" +.. +.TH pspp\-convert 1 "October 2013" "PSPP" "PSPP Manual" +. +.SH NAME +pspp\-convert \- convert SPSS system and portable files to other formats +. +.SH SYNOPSIS +\fBpspp\-convert\fR [\fIoptions\fR] \fIinput\fR \fIoutput\fR +.br +\fBpspp\-convert \-\-help\fR | \fB\-h\fR +.br +\fBpspp\-convert \-\-version\fR | \fB\-v\fR +. 
+.SH DESCRIPTION +The \fBpspp\-convert\fR program reads SPSS system or portable file +\fIinput\fR and writes it to \fIoutput\fR, performing format +conversion as necessary. +.PP +The format of \fIinput\fR is automatically detected, except that the +character encoding of old system files cannot always be guessed +correctly. Use \fB\-e \fIencoding\fR to specify the encoding in this +case. +.PP +By default, the intended format for \fIoutput\fR is inferred from its +extension: +. +.IP \fBcsv\fR +.IQ \fBtxt\fR +Comma-separated value. Each value is formatted according to its +variable's print format. The first line in the file contains variable +names. +. +.IP \fBsav\fR +.IQ \fBsys\fR +SPSS system file. +. +.IP \fBpor\fR +SPSS portable file. +. +.PP +Use \fB\-O \fIextension\fR to override the inferred format or to +specify the format for unrecognized extensions. +. +.SH "OPTIONS" +. +.IP "\fB\-O \fIformat\fR" +.IQ "\fB\-\-output\-format=\fIformat\fR" +Specifies the desired output format. \fIformat\fR must be one of the +extensions listed above, e.g. \fB\-O csv\fR requests comma-separated +value output. +. +.IP "\fB\-c \fImaxcases\fR" +.IQ "\fB\-\-cases=\fImaxcases\fR" +By default, all cases are copied from \fIinput\fR to \fIoutput\fR. +Specify this option to limit the number of cases written to +\fIoutput\fR to \fImaxcases\fR. +. +.IP "\fB\-e \fIcharset\fR" +.IQ "\fB\-\-encoding=\fIcharset\fR" +Overrides the encoding in which character strings in \fIinput\fR are +interpreted. This option is necessary because old SPSS system files +do not self-identify their encoding. +. +.IP "\fB\-h\fR" +.IQ "\fB\-\-help\fR" +Prints a usage message on stdout and exits. +. +.IP "\fB\-v\fR" +.IQ "\fB\-\-version\fR" +Prints version information on stdout and exits. +. +.SH "AUTHORS" +Ben Pfaff. +. +.SH "SEE ALSO" +. +.BR pspp (1), +.BR psppire (1).
diff --git a/utilities/pspp-convert.c b/utilities/pspp-convert.c
new file mode 100644
index 0000000000..233bfbffbf
--- /dev/null
+++ b/utilities/pspp-convert.c
@@ -0,0 +1,246 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "data/any-reader.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/csv-file-writer.h"
+#include "data/por-file-writer.h"
+#include "data/settings.h"
+#include "data/sys-file-writer.h"
+#include "data/file-handle-def.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/version-etc.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+static long long int parse_max_cases (const char *);
+static void usage (void);
+
+/* Converts the SPSS system or portable file named by the first non-option
+   argument into the file named by the second, in the format selected by
+   -O or, failing that, by the output file's extension.  Exits with a
+   nonzero status if the arguments are invalid or a file cannot be read or
+   written. */
+int
+main (int argc, char *argv[])
+{
+  const char *input_filename;
+  const char *output_filename;
+
+  struct fh_properties properties;
+  long long int max_cases = LLONG_MAX;
+  struct dictionary *dict;
+  struct casereader *reader;
+  struct file_handle *input_fh;
+
+  const char *output_format = NULL;
+  struct file_handle *output_fh;
+  struct casewriter *writer;
+
+  long long int i;
+
+  set_program_name (argv[0]);
+  i18n_init ();
+  fh_init ();
+  settings_init ();
+
+  properties = *fh_default_properties ();
+  for (;;)
+    {
+      static const struct option long_options[] =
+        {
+          { "cases", required_argument, NULL, 'c' },
+          { "encoding", required_argument, NULL, 'e' },
+
+          { "output-format", required_argument, NULL, 'O' },
+
+          { "help", no_argument, NULL, 'h' },
+          { "version", no_argument, NULL, 'v' },
+          { NULL, 0, NULL, 0 },
+        };
+
+      int c;
+
+      c = getopt_long (argc, argv, "c:e:O:hv", long_options, NULL);
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 'c':
+          max_cases = parse_max_cases (optarg);
+          break;
+
+        case 'e':
+          properties.encoding = optarg;
+          break;
+
+        case 'O':
+          output_format = optarg;
+          break;
+
+        case 'v':
+          version_etc (stdout, "pspp-convert", PACKAGE_NAME, PACKAGE_VERSION,
+                       "Ben Pfaff", "John Darrington", NULL_SENTINEL);
+          exit (EXIT_SUCCESS);
+
+        case 'h':
+          usage ();
+          exit (EXIT_SUCCESS);
+
+        default:
+          exit (EXIT_FAILURE);
+        }
+    }
+
+  if (optind + 2 != argc)
+    error (1, 0, _("exactly two non-option arguments are required; "
+                   "use --help for help"));
+
+  input_filename = argv[optind];
+  output_filename = argv[optind + 1];
+  if (output_format == NULL)
+    {
+      /* No -O: fall back to the text after the last '.' in the name. */
+      const char *dot = strrchr (output_filename, '.');
+      if (dot == NULL)
+        error (1, 0, _("%s: cannot guess output format (use -O option)"),
+               output_filename);
+
+      output_format = dot + 1;
+    }
+
+  input_fh = fh_create_file (NULL, input_filename, &properties);
+  reader = any_reader_open (input_fh, properties.encoding, &dict);
+  if (reader == NULL)
+    exit (1);
+
+  output_fh = fh_create_file (NULL, output_filename, fh_default_properties ());
+  if (!strcmp (output_format, "csv") || !strcmp (output_format, "txt"))
+    {
+      struct csv_writer_options options;
+
+      csv_writer_options_init (&options);
+      options.include_var_names = true;
+      writer = csv_writer_open (output_fh, dict, &options);
+    }
+  else if (!strcmp (output_format, "sav") || !strcmp (output_format, "sys"))
+    {
+      struct sfm_write_options options;
+
+      options = sfm_writer_default_options ();
+      writer = sfm_open_writer (output_fh, dict, options);
+    }
+  else if (!strcmp (output_format, "por"))
+    {
+      struct pfm_write_options options;
+
+      options = pfm_writer_default_options ();
+      writer = pfm_open_writer (output_fh, dict, options);
+    }
+  else
+    {
+      error (1, 0, _("%s: unknown output format `%s' (use -O option)"),
+             output_filename, output_format);
+      NOT_REACHED ();
+    }
+
+  /* NOTE(review): the writer constructors presumably return NULL when the
+     output file cannot be created, as any_reader_open does for the input;
+     confirm against their headers. */
+  if (writer == NULL)
+    error (1, 0, _("%s: error opening output file"), output_filename);
+
+  for (i = 0; i < max_cases; i++)
+    {
+      struct ccase *c;
+
+      c = casereader_read (reader);
+      if (c == NULL)
+        break;
+
+      casewriter_write (writer, c);
+    }
+
+  if (!casereader_destroy (reader))
+    error (1, 0, _("%s: error reading input file"), input_filename);
+  if (!casewriter_destroy (writer))
+    error (1, 0, _("%s: error writing output file"), output_filename);
+
+  fh_done ();
+  i18n_done ();
+
+  return 0;
+}
+
+/* Parses ARG, the argument to -c/--cases, as a nonnegative integer in C
+   syntax (decimal, octal, or hexadecimal) and returns it.  Reports an error
+   and exits if ARG is not a number, has trailing text, is negative, or is
+   out of range for long long int. */
+static long long int
+parse_max_cases (const char *arg)
+{
+  char *tail;
+  long long int n;
+
+  errno = 0;
+  n = strtoll (arg, &tail, 0);
+  if (tail == arg || *tail != '\0' || errno == ERANGE || n < 0)
+    error (1, 0, _("%s: invalid maximum number of cases"), arg);
+
+  return n;
+}
+
+/* Prints a help message on stdout. */
+static void
+usage (void)
+{
+  printf ("\
+%s, a utility for converting SPSS data files to other formats.\n\
+Usage: %s [OPTION]... INPUT OUTPUT\n\
+where INPUT is an SPSS system or portable file\n\
+  and OUTPUT is the name of the desired output file.\n\
+\n\
+The desired format of OUTPUT is by default inferred from its extension:\n\
+  csv txt             comma-separated value\n\
+  sav sys             SPSS system file\n\
+  por                 SPSS portable file\n\
+\n\
+Options:\n\
+  -O, --output-format=FORMAT  set specific output format, where FORMAT\n\
+                      is one of the extensions listed above\n\
+  -e, --encoding=CHARSET  override encoding of input data file\n\
+  -c, --cases=MAXCASES    limit number of cases to copy (default is all cases)\n\
+  -h, --help          display this help and exit\n\
+  -v, --version       output version information and exit\n",
+          program_name, program_name);
+}
-- 
2.30.2