From 8318b3fffc62b96271e4bbbeb67fe706f797e993 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 30 Dec 2019 02:48:53 +0000 Subject: [PATCH] pspp-convert: Add options for selecting variables and controlling output. --- NEWS | 6 +- doc/pspp-convert.texi | 93 +++++++++++++---- src/data/csv-file-writer.c | 13 --- src/data/csv-file-writer.h | 2 - src/language/data-io/save-translate.c | 25 +++-- utilities/automake.mk | 2 +- utilities/pspp-convert.1 | 33 +++++- utilities/pspp-convert.c | 144 ++++++++++++++++++++++---- 8 files changed, 246 insertions(+), 72 deletions(-) diff --git a/NEWS b/NEWS index 8bbecdf01d..2322a00f09 100644 --- a/NEWS +++ b/NEWS @@ -31,7 +31,11 @@ Changes from 1.2.0 to 1.3.0: - New "-a", "-l", "--password-list" options to search for an encrypted file's password. - - New "--labels" and "--recode" options for CSV output. + - New "--keep" and "--drop" options to output only selected variables. + + - New "--recode", "--no-var-names", "--labels", "--print-formats", + "--decimal", "--delimiter", and "--qualifier" options to control + CSV output. * Improvements to SAVE DATA COLLECTION support for MDD files. diff --git a/doc/pspp-convert.texi b/doc/pspp-convert.texi index 92e15b3cc2..4c31f5229a 100644 --- a/doc/pspp-convert.texi +++ b/doc/pspp-convert.texi @@ -68,20 +68,17 @@ If the password is unknown, use the @option{-a} and @option{-l} options to specify how to search for it, or @option{--password-list} to specify a file of passwords to try. -Use @code{-O format=@var{format}} to override the inferred format or to +Use @code{-O @var{format}} to override the inferred format or to specify the format for unrecognized extensions. -@command{pspp-convert} accepts the following options: +@command{pspp-convert} accepts the following general options: @table @option -@item @option{-O format=@var{format}} -Overrides the format inferred from the output file's extension. Use -@option{--help} to list the available formats. 
@xref{Invoking PSPP}, -for details of the available output formats. - -@item @option{-O @var{option}=@var{value}} -Sets an option for the output file format. @xref{Invoking PSPP}, for -details of the available output options. +@item @option{-O @var{format}} +@itemx @option{--output-format=@var{format}} +Sets the output format, where @var{format} is one of the extensions +listed above, e.g.: @option{-O csv}. Use @option{--help} to list +the supported output formats. @item -c @var{maxcases} @itemx --cases=@var{maxcases} @@ -95,17 +92,75 @@ Overrides the encoding in which character strings in @var{input} are interpreted. This option is necessary because old SPSS system files, and SPSS/PC+ system files, do not self-identify their encoding. -@item --labels -By default, @command{pspp-convert} writes variables' values to CVS -output files. With this option, @command{pspp-convert} writes value -labels. +@item -k @var{variable}@dots{} +@itemx --keep=@var{variable}@dots{} +By default, @command{pspp-convert} includes all the variables from the +input file. Use this option to list specific variables to include; +any variables not listed will be dropped. The variables in the output +file will also be reordered into the given order. The variable list +may use @code{TO} in the same way as in PSPP syntax, e.g.@: if the +dictionary contains consecutive variables @code{a}, @code{b}, +@code{c}, and @code{d}, then @option{--keep='a to d'} will include all +of them (and no others). + +@item -d @var{variable}@dots{} +@itemx --drop=@var{variable}@dots{} +Drops the specified variables from the output. + +When @option{--keep} and @option{--drop} are used together, +@option{--keep} is processed first. + +@item -h +@itemx --help +Prints a usage message on stdout and exits. +@item -v +@itemx --version +Prints version information on stdout and exits. 
+@end table + +The following options affect CSV output: + +@table @option @item --recode -By default, @command{pspp-convert} writes user-missing values to CVS +By default, @command{pspp-convert} writes user-missing values to CSV output files as their regular values. With this option, @command{pspp-convert} recodes them to system-missing values (which are written as a single space). +@item --no-var-names +By default, @command{pspp-convert} writes the variable names as the +first line of output. With this option, @command{pspp-convert} omits +this line. + +@item --labels +By default, @command{pspp-convert} writes variables' values to CSV +output files. With this option, @command{pspp-convert} writes value +labels. + +@item --print-formats +By default, @command{pspp-convert} writes numeric variables as plain +numbers. This option makes @command{pspp-convert} honor variables' +print formats. + +@item --decimal=@var{decimal} +This option sets the character used as a decimal point in output. The +default is @samp{.}. + +@item --delimiter=@var{delimiter} +This option sets the character used to separate fields in output. The +default is @samp{,}, unless the decimal point is @samp{,}, in which +case @samp{;} is used. + +@item --qualifier=@var{qualifier} +This option sets the character used to quote fields that contain the +delimiter. The default is @samp{"}. +@end table + +The following options specify how to obtain the password for encrypted +files: + +@table @option @item -p @var{password} @item --password=@var{password} Specifies the password to use to decrypt an encrypted SPSS system file @@ -137,12 +192,4 @@ Specifies the maximum length of the passwords to try. @item --password-list=@var{file} Specifies a file to read containing a list of passwords to try, one per line. If @var{file} is @file{-}, reads from stdin. - -@item -h -@itemx --help -Prints a usage message on stdout and exits. - -@item -v -@itemx --version -Prints version information on stdout and exits. 
@end table diff --git a/src/data/csv-file-writer.c b/src/data/csv-file-writer.c index e3dff983db..8f24bf3bfb 100644 --- a/src/data/csv-file-writer.c +++ b/src/data/csv-file-writer.c @@ -87,19 +87,6 @@ static void write_var_names (struct csv_writer *, const struct dictionary *); static bool write_error (const struct csv_writer *); static bool close_writer (struct csv_writer *); -/* Initializes OPTS with default options for writing a CSV file. */ -void -csv_writer_options_init (struct csv_writer_options *opts) -{ - opts->recode_user_missing = false; - opts->include_var_names = false; - opts->use_value_labels = false; - opts->use_print_formats = false; - opts->decimal = settings_get_decimal_char (FMT_F); - opts->delimiter = ','; - opts->qualifier = '"'; -} - /* Opens the CSV file designated by file handle FH for writing cases from dictionary DICT according to the given OPTS. diff --git a/src/data/csv-file-writer.h b/src/data/csv-file-writer.h index 65723bc288..cf9fee835a 100644 --- a/src/data/csv-file-writer.h +++ b/src/data/csv-file-writer.h @@ -33,8 +33,6 @@ struct csv_writer_options char qualifier; /* Quote character. 
*/ }; -void csv_writer_options_init (struct csv_writer_options *); - struct file_handle; struct dictionary; struct casewriter *csv_writer_open (struct file_handle *, diff --git a/src/language/data-io/save-translate.c b/src/language/data-io/save-translate.c index 18853a03d0..0fb474631d 100644 --- a/src/language/data-io/save-translate.c +++ b/src/language/data-io/save-translate.c @@ -49,8 +49,6 @@ cmd_save_translate (struct lexer *lexer, struct dataset *ds) struct casewriter *writer; struct file_handle *handle; - struct csv_writer_options csv_opts; - bool replace; bool retain_unselected; @@ -256,17 +254,18 @@ cmd_save_translate (struct lexer *lexer, struct dataset *ds) dict_delete_scratch_vars (dict); dict_compact_values (dict); - csv_opts.recode_user_missing = recode_user_missing; - csv_opts.include_var_names = include_var_names; - csv_opts.use_value_labels = use_value_labels; - csv_opts.use_print_formats = use_print_formats; - csv_opts.decimal = decimal; - csv_opts.delimiter = (delimiter ? delimiter - : type == TAB_FILE ? '\t' - : decimal == '.' ? ',' - : ';'); - csv_opts.qualifier = qualifier; - + struct csv_writer_options csv_opts = { + .recode_user_missing = recode_user_missing, + .include_var_names = include_var_names, + .use_value_labels = use_value_labels, + .use_print_formats = use_print_formats, + .decimal = decimal, + .delimiter = (delimiter ? delimiter + : type == TAB_FILE ? '\t' + : decimal == '.' ? 
',' + : ';'), + .qualifier = qualifier, + }; writer = csv_writer_open (handle, dict, &csv_opts); if (writer == NULL) goto error; diff --git a/utilities/automake.mk b/utilities/automake.mk index f4cf8983c8..cde13530c2 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -26,7 +26,7 @@ bin_PROGRAMS += utilities/pspp-convert dist_man_MANS += utilities/pspp-convert.1 utilities_pspp_convert_SOURCES = utilities/pspp-convert.c utilities_pspp_convert_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\" -utilities_pspp_convert_LDADD = src/libpspp-core.la +utilities_pspp_convert_LDADD = src/libpspp.la src/libpspp-core.la $(CAIRO_LIBS) utilities_pspp_convert_LDFLAGS = $(PSPP_LDFLAGS) $(PG_LDFLAGS) if RELOCATABLE_VIA_LD diff --git a/utilities/pspp-convert.1 b/utilities/pspp-convert.1 index c568e3c0fc..08b5fec065 100644 --- a/utilities/pspp-convert.1 +++ b/utilities/pspp-convert.1 @@ -81,21 +81,52 @@ Overrides the encoding in which character strings in \fIinput\fR are interpreted. This option is necessary because old SPSS system files do not self-identify their encoding. . +.IP "\fB\-k \fIvar\fR..." +.IQ "\fB\-\-keep=\fIvar\fR..." +Drops all variables except those listed as \fIvar\fR, and reorders the +remaining variables into the specified order. +. +.IP "\fB\-d \fIvar\fR..." +.IQ "\fB\-\-drop=\fIvar\fR..." +Drops each \fIvar\fR listed from the output. +. .SS "CSV Output Options" .PP These options affect only output to \fB.csv\fR and \fB.txt\fR files. .IP "\fB\-\-labels\fR" By default, \fBpspp\-convert\fR writes variables' values to the output. With this option, \fBpspp\-convert\fR writes value labels. +.IP "\fB\-\-no\-var\-names\fR" +By default, \fBpspp\-convert\fR writes the variable names as the +first line of output. With this option, \fBpspp\-convert\fR omits +this line. .IP "\fB\-\-recode\fR" By default, \fBpspp\-convert\fR writes user-missing values as their regular values. 
With this option, \fBpspp\-convert\fR recodes them to system-missing values (which are written as a single space). + +.IP "\fB\-\-print\-formats\fR" +By default, \fBpspp\-convert\fR writes numeric variables as plain +numbers. This option makes \fBpspp\-convert\fR honor variables' +print formats. + +.IP "\fB\-\-decimal=\fIdecimal\fR" +This option sets the character used as a decimal point in output. The +default is a period (\fB.\fR). + +.IP "\fB\-\-delimiter=\fIdelimiter\fR" +This option sets the character used to separate fields in output. The +default is a comma (\fB,\fR), unless the decimal point is a comma, in +which case a semicolon (\fB;\fR) is used. + +.IP "\fB\-\-qualifier=\fIqualifier\fR" +This option sets the character used to quote fields that contain the +delimiter. The default is a double quote (\fB\(dq\fR). . .SS "Password Options" When the input file is encrypted, \fBpspp\-convert\fR needs to obtain a password to decrypt it. To do so, the user may specify the password -with \f\-p\fR (or \fB\-\-password), or the name of a file containing a +with \fB\-p\fR (or \fB\-\-password\fR), or the name of a file containing a list of possible passwords with \fB\-\-password\-list\fR, or an alphabet of possible passwords to try along with a maximum length with \fB\-a\fR (or \fB\-\-password\-alphabet\fR) and \fB\-l\fR (or diff --git a/utilities/pspp-convert.c b/utilities/pspp-convert.c index 3489a86b5d..a9e075a0e7 100644 --- a/utilities/pspp-convert.c +++ b/utilities/pspp-convert.c @@ -33,12 +33,16 @@ #include "data/settings.h" #include "data/sys-file-writer.h" #include "data/file-handle-def.h" +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" #include "libpspp/assertion.h" #include "libpspp/cast.h" #include "libpspp/i18n.h" #include "gl/error.h" #include "gl/getpass.h" +#include "gl/localcharset.h" #include "gl/progname.h" #include "gl/version-etc.h" @@ -54,6 +58,42 @@ static bool decrypt_file (struct 
encrypted_file *enc, const char *alphabet, int max_length, const char *password_list); +static void +parse_character_option (const char *arg, const char *option_name, char *out) +{ + if (strlen (arg) != 1) + { + /* XXX support multibyte characters */ + error (1, 0, _("%s argument must be a single character"), option_name); + } + *out = arg[0]; +} + +static bool +parse_variables_option (const char *arg, struct dictionary *dict, + struct variable ***vars, size_t *n_vars) +{ + struct lexer *lexer = lex_create (); + lex_append (lexer, lex_reader_for_string (arg, locale_charset ())); + lex_get (lexer); + + bool ok = parse_variables (lexer, dict, vars, n_vars, 0); + if (ok && (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)) + { + lex_error (lexer, _("expecting variable name")); + ok = false; + } + + lex_destroy (lexer); + if (!ok) + { + free (*vars); + *vars = NULL; + *n_vars = 0; + } + return ok; +} + int main (int argc, char *argv[]) { @@ -61,8 +101,10 @@ main (int argc, char *argv[]) const char *output_filename; long long int max_cases = LLONG_MAX; + const char *keep = NULL; + const char *drop = NULL; struct dictionary *dict = NULL; - struct casereader *reader; + struct casereader *reader = NULL; struct file_handle *input_fh = NULL; const char *encoding = NULL; struct encrypted_file *enc; @@ -75,8 +117,12 @@ main (int argc, char *argv[]) const char *password_list = NULL; int length = 0; - bool recode_user_missing = false; - bool use_value_labels = false; + struct csv_writer_options csv_opts = { + .include_var_names = true, + .decimal = '.', + .delimiter = 0, /* The default will be set later. 
*/ + .qualifier = '"', + }; long long int i; @@ -90,16 +136,28 @@ main (int argc, char *argv[]) enum { OPT_PASSWORD_LIST = UCHAR_MAX + 1, - OPT_LABELS, OPT_RECODE, + OPT_NO_VAR_NAMES, + OPT_LABELS, + OPT_PRINT_FORMATS, + OPT_DECIMAL, + OPT_DELIMITER, + OPT_QUALIFIER, }; static const struct option long_options[] = { - { "cases", required_argument, NULL, 'c' }, + { "cases", required_argument, NULL, 'c' }, + { "keep", required_argument, NULL, 'k' }, + { "drop", required_argument, NULL, 'd' }, { "encoding", required_argument, NULL, 'e' }, - { "labels", no_argument, NULL, OPT_LABELS }, { "recode", no_argument, NULL, OPT_RECODE }, + { "no-var-names", no_argument, NULL, OPT_NO_VAR_NAMES }, + { "labels", no_argument, NULL, OPT_LABELS }, + { "print-formats", no_argument, NULL, OPT_PRINT_FORMATS }, + { "decimal", required_argument, NULL, OPT_DECIMAL }, + { "delimiter", required_argument, NULL, OPT_DELIMITER }, + { "qualifier", required_argument, NULL, OPT_QUALIFIER }, { "password", required_argument, NULL, 'p' }, { "password-alphabet", required_argument, NULL, 'a' }, @@ -115,7 +173,7 @@ main (int argc, char *argv[]) int c; - c = getopt_long (argc, argv, "c:e:p:a:l:O:hv", long_options, NULL); + c = getopt_long (argc, argv, "c:k:d:e:p:a:l:O:hv", long_options, NULL); if (c == -1) break; @@ -125,6 +183,14 @@ main (int argc, char *argv[]) max_cases = strtoull (optarg, NULL, 0); break; + case 'k': + keep = optarg; + break; + + case 'd': + drop = optarg; + break; + case 'e': encoding = optarg; break; @@ -141,12 +207,28 @@ main (int argc, char *argv[]) password_list = optarg; break; + case OPT_RECODE: + csv_opts.recode_user_missing = true; + break; + + case OPT_NO_VAR_NAMES: + csv_opts.include_var_names = false; + break; + case OPT_LABELS: - use_value_labels = true; + csv_opts.use_value_labels = true; break; - case OPT_RECODE: - recode_user_missing = true; + case OPT_DECIMAL: + parse_character_option (optarg, "--decimal", &csv_opts.decimal); + break; + + case OPT_DELIMITER: + 
parse_character_option (optarg, "--delimiter", &csv_opts.delimiter); + break; + + case OPT_QUALIFIER: + parse_character_option (optarg, "--qualifier", &csv_opts.qualifier); break; case 'a': @@ -212,15 +294,33 @@ main (int argc, char *argv[]) if (reader == NULL) goto error; - if (!strcmp (output_format, "csv") || !strcmp (output_format, "txt")) + if (keep) + { + struct variable **keep_vars; + size_t n_keep_vars; + if (!parse_variables_option (keep, dict, &keep_vars, &n_keep_vars)) + goto error; + dict_reorder_vars (dict, keep_vars, n_keep_vars); + dict_delete_consecutive_vars (dict, n_keep_vars, + dict_get_var_cnt (dict) - n_keep_vars); + free (keep_vars); + } + + if (drop) { - struct csv_writer_options options; + struct variable **drop_vars; + size_t n_drop_vars; + if (!parse_variables_option (drop, dict, &drop_vars, &n_drop_vars)) + goto error; + dict_delete_vars (dict, drop_vars, n_drop_vars); + free (drop_vars); + } - csv_writer_options_init (&options); - options.include_var_names = true; - options.use_value_labels = use_value_labels; - options.recode_user_missing = recode_user_missing; - writer = csv_writer_open (output_fh, dict, &options); + if (!strcmp (output_format, "csv") || !strcmp (output_format, "txt")) + { + if (!csv_opts.delimiter) + csv_opts.delimiter = csv_opts.decimal == '.' ? ',' : ';'; + writer = csv_writer_open (output_fh, dict, &csv_opts); } else if (!strcmp (output_format, "sav") || !strcmp (output_format, "sys")) { @@ -272,6 +372,7 @@ exit: return 0; error: + casereader_destroy (reader); ds_destroy (&alphabet); dict_unref (dict); fh_unref (output_fh); @@ -473,9 +574,16 @@ General options:\n\ is one of the extensions listed above\n\ -e, --encoding=CHARSET override encoding of input data file\n\ -c MAXCASES limit number of cases to copy (default is all cases)\n\ + -k, --keep=VAR... include only the given variables in output\n\ + -d, --drop=VAR... 
drop the given variables from output\n\ CSV output options:\n\ - --labels write value labels to output\n\ --recode convert user-missing values to system-missing\n\ + --no-var-names do not include variable names as first row\n\ + --labels write value labels to output\n\ + --print-formats honor variables' print formats\n\ + --decimal=CHAR use CHAR as the decimal point (default: .)\n\ + --delimiter=CHAR use CHAR to separate fields (default: ,)\n\ + --qualifier=CHAR use CHAR to quote the delimiter (default: \")\n\ Password options (for used with encrypted files):\n\ -p PASSWORD individual password\n\ -a ALPHABET with -l, alphabet of passwords to try\n\ -- 2.30.2