1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/case.h"
20 #include "data/casereader.h"
21 #include "data/casewriter.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/transformations.h"
26 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/data-io/data-parser.h"
29 #include "language/data-io/data-reader.h"
30 #include "language/data-io/file-handle.h"
31 #include "language/data-io/inpt-pgm.h"
32 #include "language/data-io/placement-parser.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/i18n.h"
36 #include "libpspp/message.h"
37 #include "libpspp/misc.h"
40 #include "gl/xalloc.h"
43 #define _(msgid) gettext (msgid)
45 /* DATA LIST transformation data.
   cmd_matrix allocates one of these when it runs inside an input
   program and registers it with add_transformation together with
   data_list_trns_proc and data_list_trns_free. */
48 struct data_parser *parser; /* Parser handed to data_parser_parse for each case. */
49 struct dfm_reader *reader; /* Data file reader that the parser reads from. */
50 struct variable *end; /* Variable specified on END subcommand; NULL if there is none. */
/* Transformation callbacks that cmd_matrix passes to add_transformation. */
53 static trns_free_func data_list_trns_free;
54 static trns_proc_func data_list_trns_proc;
/* Width of the ROWTYPE_ string variable.  The same count bounds every
   comparison and copy of a row type keyword in this file. */
69 static const int ROWTYPE_WIDTH = 8;
/* Description of the matrix layout.  cmd_matrix fills these members in
   and preprocess reads them through its AUX argument. */
73 enum triangle triangle; /* LOWER by default; UPPER, LOWER or FULL from FORMAT. */
74 enum diagonal diagonal; /* DIAGONAL by default; NO_DIAGONAL from FORMAT=NODIAGONAL. */
75 const struct variable *rowtype; /* The ROWTYPE_ string variable. */
76 const struct variable *varname; /* The VARNAME_ string variable. */
77 int n_continuous_vars; /* Counted per VARIABLES name other than ROWTYPE_, less the SPLIT variables. */
78 struct variable **split_vars; /* Variables parsed from SPLIT; freed by cmd_matrix. */
84 valid rowtype_ values:
101 /* Sets the value of OUTCASE which corresponds to VNAME
102 to the value STR. VNAME must be of type string.
105 set_varname_column (struct ccase *outcase, const struct variable *vname,
108 int len = var_get_width (vname);
109 uint8_t *s = case_str_rw (outcase, vname);
111 strncpy (CHAR_CAST (char *, s), str, len);
/* Overwrites OUTCASE's value for the string variable VNAME with spaces
   across the variable's whole width. */
static void
blank_varname_column (struct ccase *outcase, const struct variable *vname)
{
  const int width = var_get_width (vname);
  uint8_t *dest = case_str_rw (outcase, vname);

  memset (dest, ' ', width);
}
/* Post-processes the cases read by MATRIX DATA.

   CASEREADER0 supplies the cases as parsed, DICT is the dictionary
   they belong to, and AUX points to the struct matrix_format that
   cmd_matrix filled in.

   Two passes are made.  The first reads a clone of CASEREADER0 and
   collects the matrix entries into one temporary square matrix per
   split group.  The second reads CASEREADER0 itself and writes the
   cases to a new writer, filling the matrix columns of the output
   rows from the temporary matrices.

   On the normal path CASEREADER0 is destroyed and a reader is made
   from the writer; on the cleanup path at the end both CASEREADER0
   and the writer are destroyed.
   NOTE(review): the return statements are not visible here --
   presumably the new reader, or NULL after cleanup; confirm. */
123 static struct casereader *
124 preprocess (struct casereader *casereader0, const struct dictionary *dict, void *aux)
126 struct matrix_format *mformat = aux;
127 const struct caseproto *proto = casereader_get_proto (casereader0);
128 struct casewriter *writer = autopaging_writer_create (proto);
129 struct ccase *prev_case = NULL;
130 double **matrices = NULL;
/* Bytes needed for one square matrix with a row and a column per
   continuous variable. */
133 const size_t sizeof_matrix =
134 sizeof (double) * mformat->n_continuous_vars * mformat->n_continuous_vars;
137 /* Make an initial pass to populate our temporary matrix */
138 struct casereader *pass0 = casereader_clone (casereader0);
/* PREV_VALUES holds the split variable values of the preceding case so
   that a change of split group can be detected. */
140 union value *prev_values = xcalloc (mformat->n_split_vars, sizeof *prev_values);
/* Row counting starts at 1 instead of 0 for a lower triangle without a
   diagonal (presumably because its first row would hold no entries). */
141 int row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ? 1 : 0;
142 bool first_case = true;
143 for (; (c = casereader_read (pass0)) != NULL; case_unref (c))
150 for (s = 0; s < mformat->n_split_vars; ++s)
152 const struct variable *svar = mformat->split_vars[s];
153 const union value *sv = case_data (c, svar);
154 if (! value_equal (prev_values + s, sv, var_get_width (svar)))
/* No matrix yet, or the split values changed: start a new matrix and
   restart the row count. */
163 if (matrices == NULL || ! match)
165 row = (mformat->triangle == LOWER && mformat->diagonal == NO_DIAGONAL) ?
169 matrices = xrealloc (matrices, sizeof (double*) * n_splits);
170 matrices[n_splits - 1] = xmalloc (sizeof_matrix);
173 for (s = 0; s < mformat->n_split_vars; ++s)
175 const struct variable *svar = mformat->split_vars[s];
176 const union value *sv = case_data (c, svar);
177 value_clone (prev_values + s, sv, var_get_width (svar));
/* For an upper triangle the first column to fill is the current row;
   otherwise filling starts at column 0. */
180 int c_offset = (mformat->triangle == UPPER) ? row : 0;
181 if (mformat->triangle == UPPER && mformat->diagonal == NO_DIAGONAL)
183 const union value *v = case_data (c, mformat->rowtype);
184 const char *val = CHAR_CAST (const char *, v->s);
/* The matrix filling below appears to apply to CORR and COV rows only. */
185 if (0 == strncasecmp (val, "corr ", ROWTYPE_WIDTH) ||
186 0 == strncasecmp (val, "cov ", ROWTYPE_WIDTH))
/* More matrix rows than declared variables: report it and drop the
   first-pass reader. */
188 if (row >= mformat->n_continuous_vars)
191 _("There are %d variable declared but the data has at least %d matrix rows."),
192 mformat->n_continuous_vars, row + 1);
194 casereader_destroy (pass0);
199 for (col = c_offset; col < mformat->n_continuous_vars; ++col)
201 const struct variable *var =
204 var_get_dict_index (mformat->varname));
206 double e = case_data (c, var)->f;
210 /* Fill in the lower triangle */
211 (matrices[n_splits-1])[col + mformat->n_continuous_vars * row] = e;
213 if (mformat->triangle != FULL)
214 /* Fill in the upper triangle */
215 (matrices[n_splits-1]) [row + mformat->n_continuous_vars * col] = e;
/* The first pass is finished, so its reader is no longer needed. */
220 casereader_destroy (pass0);
223 /* Now make a second pass to fill in the other triangle from our
225 const int idx = var_get_dict_index (mformat->varname);
231 struct ccase *outcase = case_create (proto);
232 union value *v = case_data_rw (outcase, mformat->rowtype);
233 memcpy (v->s, "N ", ROWTYPE_WIDTH);
234 blank_varname_column (outcase, mformat->varname);
235 for (col = 0; col < mformat->n_continuous_vars; ++col)
237 union value *dest_val =
238 case_data_rw_idx (outcase,
239 1 + col + var_get_dict_index (mformat->varname));
240 dest_val->f = mformat->n;
242 casewriter_write (writer, outcase);
246 prev_values = xcalloc (mformat->n_split_vars, sizeof *prev_values);
248 for (; (c = casereader_read (casereader0)) != NULL; prev_case = c)
255 for (s = 0; s < mformat->n_split_vars; ++s)
257 const struct variable *svar = mformat->split_vars[s];
258 const union value *sv = case_data (c, svar);
259 if (! value_equal (prev_values + s, sv, var_get_width (svar)))
273 for (s = 0; s < mformat->n_split_vars; ++s)
275 const struct variable *svar = mformat->split_vars[s];
276 const union value *sv = case_data (c, svar);
277 value_clone (prev_values + s, sv, var_get_width (svar));
280 case_unref (prev_case);
281 const union value *v = case_data (c, mformat->rowtype);
282 const char *val = CHAR_CAST (const char *, v->s);
285 if (0 == strncasecmp (val, "n ", ROWTYPE_WIDTH) ||
286 0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
289 _("The N subcommand was specified, but a N record was also found in the data. The N record will be ignored."));
294 struct ccase *outcase = case_create (proto);
295 case_copy (outcase, 0, c, 0, caseproto_get_n_widths (proto));
297 if (0 == strncasecmp (val, "corr ", ROWTYPE_WIDTH) ||
298 0 == strncasecmp (val, "cov ", ROWTYPE_WIDTH))
301 const struct variable *var = dict_get_var (dict, idx + 1 + row);
302 set_varname_column (outcase, mformat->varname, var_get_name (var));
303 value_copy (case_data_rw (outcase, mformat->rowtype), v, ROWTYPE_WIDTH);
305 for (col = 0; col < mformat->n_continuous_vars; ++col)
307 union value *dest_val =
308 case_data_rw_idx (outcase,
309 1 + col + var_get_dict_index (mformat->varname));
310 dest_val->f = (matrices[n_splits - 1])[col + mformat->n_continuous_vars * row];
311 if (col == row && mformat->diagonal == NO_DIAGONAL)
318 blank_varname_column (outcase, mformat->varname);
321 /* Special case for SD and N_VECTOR: Rewrite as STDDEV and N respectively */
322 if (0 == strncasecmp (val, "sd ", ROWTYPE_WIDTH))
324 value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
325 (uint8_t *) "STDDEV", 6, ' ');
327 else if (0 == strncasecmp (val, "n_vector", ROWTYPE_WIDTH))
329 value_copy_buf_rpad (case_data_rw (outcase, mformat->rowtype), ROWTYPE_WIDTH,
330 (uint8_t *) "N", 1, ' ');
333 casewriter_write (writer, outcase);
336 /* If NODIAGONAL is specified, then a final case must be written */
337 if (mformat->diagonal == NO_DIAGONAL)
340 struct ccase *outcase = case_create (proto);
/* The extra case starts as a copy of the last case read. */
343 case_copy (outcase, 0, prev_case, 0, caseproto_get_n_widths (proto));
345 const struct variable *var = dict_get_var (dict, idx + 1 + row);
346 set_varname_column (outcase, mformat->varname, var_get_name (var));
348 for (col = 0; col < mformat->n_continuous_vars; ++col)
350 union value *dest_val =
351 case_data_rw_idx (outcase, 1 + col +
352 var_get_dict_index (mformat->varname));
353 dest_val->f = (matrices[n_splits - 1]) [col + mformat->n_continuous_vars * row];
354 if (col == row && mformat->diagonal == NO_DIAGONAL)
358 casewriter_write (writer, outcase);
363 case_unref (prev_case);
366 for (i = 0 ; i < n_splits; ++i)
/* Turn the cases written so far into a new reader; the original reader
   is no longer needed. */
369 struct casereader *reader1 = casewriter_make_reader (writer);
370 casereader_destroy (casereader0);
/* Cleanup path that discards WRITER instead of turning it into a reader
   (presumably the error exit -- its label is not visible here). */
376 case_unref (prev_case);
378 for (i = 0 ; i < n_splits; ++i)
381 casereader_destroy (casereader0);
382 casewriter_destroy (writer);
/* Parses the MATRIX DATA command from LEXER for dataset DS.

   The VARIABLES list comes first: ROWTYPE_ and VARNAME_ string
   variables are created, and every listed name other than ROWTYPE_
   becomes a new variable with format F10.4, read as a delimited
   field.  The optional subcommands N, FORMAT, FILE and SPLIT follow,
   each introduced by a slash.

   Inside an input program the parser and reader are installed as a
   transformation; otherwise the parsed data become the active file,
   with preprocess supplied as the post-processing hook.

   Returns CMD_DATA_LIST on success and CMD_CASCADING_FAILURE on the
   failure exit, which destroys the parser and frees the split
   variable array. */
387 cmd_matrix (struct lexer *lexer, struct dataset *ds)
389 struct dictionary *dict;
390 struct data_parser *parser;
391 struct dfm_reader *reader;
392 struct file_handle *fh = NULL;
393 char *encoding = NULL;
394 struct matrix_format mformat;
/* Defaults: lower triangle with diagonal, no split variables. */
399 mformat.triangle = LOWER;
400 mformat.diagonal = DIAGONAL;
401 mformat.n_split_vars = 0;
402 mformat.split_vars = NULL;
/* Outside an input program a fresh dictionary is created.
   NOTE(review): the in-input-program branch is not visible here --
   presumably it reuses the dataset's dictionary; confirm. */
405 dict = (in_input_program ()
407 : dict_create (get_default_encoding ()));
408 parser = data_parser_create (dict);
/* Matrix data are read as delimited fields; missing fields produce no
   warning, and records do not span lines unless FORMAT=FREE says so. */
411 data_parser_set_type (parser, DP_DELIMITED);
412 data_parser_set_warn_missing_fields (parser, false);
413 data_parser_set_span (parser, false);
415 mformat.rowtype = dict_create_var (dict, "ROWTYPE_", ROWTYPE_WIDTH);
417 mformat.n_continuous_vars = 0;
/* n_split_vars was already zeroed above; this repeats that. */
418 mformat.n_split_vars = 0;
420 if (! lex_force_match_id (lexer, "VARIABLES"))
423 lex_match (lexer, T_EQUALS);
425 if (! parse_mixed_vars (lexer, dict, &names, &n_names, PV_NO_DUPLICATE))
428 for (i = 0; i < n_names; ++i)
/* VARNAME_ is made wide enough for the longest listed name, rounded up
   to a multiple of 8. */
434 int longest_name = 0;
435 for (i = 0; i < n_names; ++i)
437 maximize_int (&longest_name, strlen (names[i]));
440 mformat.varname = dict_create_var (dict, "VARNAME_",
441 8 * DIV_RND_UP (longest_name, 8));
/* ROWTYPE_ maps onto the string variable created earlier (A8 input
   format); the F10.4 variable creation below appears to be the branch
   for every other name.  Those fields use case indexes counted upward
   from VARNAME_'s. */
443 for (i = 0; i < n_names; ++i)
445 if (0 == strcasecmp (names[i], "ROWTYPE_"))
447 const struct fmt_spec fmt = fmt_for_input (FMT_A, 8, 0);
448 data_parser_add_delimited_field (parser,
450 var_get_case_index (mformat.rowtype),
455 const struct fmt_spec fmt = fmt_for_input (FMT_F, 10, 4);
456 struct variable *v = dict_create_var (dict, names[i], 0);
457 var_set_both_formats (v, &fmt);
458 data_parser_add_delimited_field (parser,
460 var_get_case_index (mformat.varname) +
461 ++mformat.n_continuous_vars,
465 for (i = 0; i < n_names; ++i)
/* Optional subcommands, each preceded by a slash, until the end of the
   command. */
469 while (lex_token (lexer) != T_ENDCMD)
471 if (! lex_force_match (lexer, T_SLASH))
/* N: an integer that the message below requires to be non-negative. */
474 if (lex_match_id (lexer, "N"))
476 lex_match (lexer, T_EQUALS);
478 if (! lex_force_int (lexer))
481 mformat.n = lex_integer (lexer);
484 msg (SE, _("%s must not be negative."), "N");
/* FORMAT: any sequence of layout keywords up to the next slash or the
   end of the command. */
489 else if (lex_match_id (lexer, "FORMAT"))
491 lex_match (lexer, T_EQUALS);
493 while (lex_token (lexer) != T_SLASH && (lex_token (lexer) != T_ENDCMD))
495 if (lex_match_id (lexer, "LIST"))
497 data_parser_set_span (parser, false);
499 else if (lex_match_id (lexer, "FREE"))
501 data_parser_set_span (parser, true);
503 else if (lex_match_id (lexer, "UPPER"))
505 mformat.triangle = UPPER;
507 else if (lex_match_id (lexer, "LOWER"))
509 mformat.triangle = LOWER;
511 else if (lex_match_id (lexer, "FULL"))
513 mformat.triangle = FULL;
515 else if (lex_match_id (lexer, "DIAGONAL"))
517 mformat.diagonal = DIAGONAL;
519 else if (lex_match_id (lexer, "NODIAGONAL"))
521 mformat.diagonal = NO_DIAGONAL;
525 lex_error (lexer, NULL);
/* FILE: a file handle, which may refer to a file or to inline data. */
530 else if (lex_match_id (lexer, "FILE"))
532 lex_match (lexer, T_EQUALS);
534 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
/* SPLIT: the named variables get format F4.0, are reordered in the
   dictionary, and are subtracted from the continuous variable count. */
538 else if (lex_match_id (lexer, "SPLIT"))
540 lex_match (lexer, T_EQUALS);
541 if (! parse_variables (lexer, dict, &mformat.split_vars, &mformat.n_split_vars, 0))
543 free (mformat.split_vars);
547 for (i = 0; i < mformat.n_split_vars; ++i)
549 const struct fmt_spec fmt = fmt_for_input (FMT_F, 4, 0);
550 var_set_both_formats (mformat.split_vars[i], &fmt);
552 dict_reorder_vars (dict, mformat.split_vars, mformat.n_split_vars);
553 mformat.n_continuous_vars -= mformat.n_split_vars;
557 lex_error (lexer, NULL);
562 if (mformat.diagonal == NO_DIAGONAL && mformat.triangle == FULL)
564 msg (SE, _("FORMAT = FULL and FORMAT = NODIAGONAL are mutually exclusive."));
/* Use the inline file as the data source.
   NOTE(review): the guarding condition is not visible here --
   presumably this applies only when FILE gave no handle; confirm. */
569 fh = fh_inline_file ();
570 fh_set_default_handle (fh);
572 if (!data_parser_any_fields (parser))
574 msg (SE, _("At least one variable must be specified."));
578 if (lex_end_of_command (lexer) != CMD_SUCCESS)
581 reader = dfm_open_reader (fh, lexer, encoding);
/* In an input program the parser and reader are installed as a
   transformation; otherwise the data become the active file, with
   preprocess as the post-processing hook. */
585 if (in_input_program ())
587 struct data_list_trns *trns = xmalloc (sizeof *trns);
588 trns->parser = parser;
589 trns->reader = reader;
591 add_transformation (ds, data_list_trns_proc, data_list_trns_free, trns);
595 data_parser_make_active_file (parser, ds, reader, dict, preprocess,
601 free (mformat.split_vars);
603 return CMD_DATA_LIST;
/* Failure exit: the parser is destroyed and the split variable array
   is freed before the failure is reported. */
606 data_parser_destroy (parser);
607 if (!in_input_program ())
611 free (mformat.split_vars);
612 return CMD_CASCADING_FAILURE;
616 /* Input procedure. */
618 /* Destroys DATA LIST transformation TRNS.
619 Returns true if successful, false if an I/O error occurred.

   TRNS_ is the struct data_list_trns that cmd_matrix registered with
   add_transformation. */
621 data_list_trns_free (void *trns_)
623 struct data_list_trns *trns = trns_;
/* Destroy the parser, then close the data file reader. */
624 data_parser_destroy (trns->parser);
625 dfm_close_reader (trns->reader);
630 /* Handle DATA LIST transformation TRNS, parsing data into *C. */
632 data_list_trns_proc (void *trns_, struct ccase **c, casenumber case_num UNUSED)
634 struct data_list_trns *trns = trns_;
637 *c = case_unshare (*c);
638 if (data_parser_parse (trns->parser, trns->reader, *c))
639 retval = TRNS_CONTINUE;
640 else if (dfm_reader_error (trns->reader) || dfm_eof (trns->reader) > 1)
642 /* An I/O error, or encountering end of file for a second
643 time, should be escalated into a more serious error. */
647 retval = TRNS_END_FILE;
649 /* If there was an END subcommand handle it. */
650 if (trns->end != NULL)
652 double *end = &case_data_rw (*c, trns->end)->f;
653 if (retval == TRNS_END_FILE)
656 retval = TRNS_CONTINUE;