1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017, 2019 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "matrix-reader.h"
24 #include "data/casegrouper.h"
25 #include "data/casereader.h"
26 #include "data/casewriter.h"
27 #include "data/data-out.h"
28 #include "data/dataset.h"
29 #include "data/dictionary.h"
30 #include "data/format.h"
31 #include "data/variable.h"
32 #include "language/command.h"
33 #include "language/lexer/lexer.h"
34 #include "libpspp/i18n.h"
35 #include "libpspp/message.h"
36 #include "libpspp/str.h"
37 #include "output/pivot-table.h"
40 #define _(msgid) gettext (msgid)
41 #define N_(msgid) msgid
46 This module interprets a "data matrix", typically generated by the command
47 MATRIX DATA. The dictionary of such a matrix takes the form:
49 s_0, s_1, ... s_m, ROWTYPE_, VARNAME_, v_0, v_1, .... v_n
51 where s_0, s_1 ... s_m are the variables defining the splits, and
52 v_0, v_1 ... v_n are the continuous variables.
56 The ROWTYPE_ variable is of type A8.
57 The VARNAME_ variable is a string type whose width is not predetermined.
58 The variables s_x are of type F4.0 (although this reader accepts any type),
59 and v_x are of any numeric type.
61 The values of the ROWTYPE_ variable are in the set {MEAN, STDDEV, N, CORR, COV}
62 and determine the purpose of that case.
63 The values of the VARNAME_ variable must correspond to the names of the varibles
64 in {v_0, v_1 ... v_n} and indicate the rows of the correlation or covariance
69 A typical example is as follows:
71 s_0 ROWTYPE_ VARNAME_ v_0 v_1 v_2
73 0 MEAN 5.0000 4.0000 3.0000
74 0 STDDEV 1.0000 2.0000 3.0000
75 0 N 9.0000 9.0000 9.0000
76 0 CORR V1 1.0000 .6000 .7000
77 0 CORR V2 .6000 1.0000 .8000
78 0 CORR V3 .7000 .8000 1.0000
79 1 MEAN 9.0000 8.0000 7.0000
80 1 STDDEV 5.0000 6.0000 7.0000
81 1 N 9.0000 9.0000 9.0000
82 1 CORR V1 1.0000 .4000 .3000
83 1 CORR V2 .4000 1.0000 .2000
84 1 CORR V3 .3000 .2000 1.0000
89 matrix_material_uninit (struct matrix_material *mm)
91 gsl_matrix_free (mm->corr);
92 gsl_matrix_free (mm->cov);
93 gsl_matrix_free (mm->n);
94 gsl_matrix_free (mm->mean_matrix);
95 gsl_matrix_free (mm->var_matrix);
98 static const struct variable *
99 find_matrix_string_var (const struct dictionary *dict, const char *name)
101 const struct variable *var = dict_lookup_var (dict, name);
104 msg (SE, _("Matrix dataset lacks a variable called %s."), name);
107 if (!var_is_alpha (var))
109 msg (SE, _("Matrix dataset variable %s should be of string type."), name);
115 struct matrix_reader *
116 matrix_reader_create (const struct dictionary *dict,
117 struct casereader *in_reader)
119 const struct variable *varname = find_matrix_string_var (dict, "VARNAME_");
120 const struct variable *rowtype = find_matrix_string_var (dict, "ROWTYPE_");
121 if (!varname || !rowtype)
124 size_t varname_idx = var_get_dict_index (varname);
125 size_t rowtype_idx = var_get_dict_index (rowtype);
126 if (varname_idx < rowtype_idx)
128 msg (SE, _("Variable %s must precede %s in matrix file dictionary."),
129 "ROWTYPE_", "VARNAME_");
133 for (size_t i = 0; i < dict_get_n_vars (dict); i++)
135 const struct variable *v = dict_get_var (dict, i);
136 if (!var_is_numeric (v) && v != rowtype && v != varname)
138 msg (SE, _("Matrix dataset variable %s should be numeric."),
145 const struct variable **vars;
146 dict_get_vars (dict, &vars, &n_vars, DC_SCRATCH);
148 /* Different kinds of variables. */
149 size_t first_svar = 0;
150 size_t n_svars = rowtype_idx;
151 size_t first_fvar = rowtype_idx + 1;
152 size_t n_fvars = varname_idx - rowtype_idx - 1;
153 size_t first_cvar = varname_idx + 1;
154 size_t n_cvars = n_vars - varname_idx - 1;
157 msg (SE, _("Matrix dataset does not have any continuous variables."));
162 struct matrix_reader *mr = xmalloc (sizeof *mr);
163 *mr = (struct matrix_reader) {
165 .grouper = casegrouper_create_vars (in_reader, &vars[first_svar], n_svars),
166 .svars = xmemdup (vars + first_svar, n_svars * sizeof *mr->svars),
169 .fvars = xmemdup (vars + first_fvar, n_fvars * sizeof *mr->fvars),
172 .cvars = xmemdup (vars + first_cvar, n_cvars * sizeof *mr->cvars),
181 matrix_reader_destroy (struct matrix_reader *mr)
185 bool ret = casegrouper_destroy (mr->grouper);
195 Allocates MATRIX if necessary,
196 and populates row MROW, from the data in C corresponding to
197 variables in VARS. N_VARS is the length of VARS.
200 matrix_fill_row (gsl_matrix **matrix,
201 const struct ccase *c, int mrow,
202 const struct variable **vars, size_t n_vars)
207 *matrix = gsl_matrix_alloc (n_vars, n_vars);
208 gsl_matrix_set_all (*matrix, SYSMIS);
211 for (col = 0; col < n_vars; ++col)
213 const struct variable *cv = vars [col];
214 double x = case_num (c, cv);
215 assert (col < (*matrix)->size2);
216 assert (mrow < (*matrix)->size1);
217 gsl_matrix_set (*matrix, mrow, col, x);
222 find_varname (const struct variable **vars, int n_vars,
225 for (int i = 0; i < n_vars; i++)
226 if (!strcasecmp (var_get_name (vars[i]), varname))
232 matrix_reader_get_string (const struct ccase *c, const struct variable *var)
234 struct substring s = case_ss (c, var);
235 ss_rtrim (&s, ss_cstr (CC_SPACES));
240 matrix_reader_set_string (struct ccase *c, const struct variable *var,
241 struct substring src)
243 struct substring dst = case_ss (c, var);
244 for (size_t i = 0; i < dst.length; i++)
245 dst.string[i] = i < src.length ? src.string[i] : ' ';
249 matrix_reader_next (struct matrix_material *mm, struct matrix_reader *mr,
250 struct casereader **groupp)
252 struct casereader *group;
253 if (!casegrouper_get_next_group (mr->grouper, &group))
255 *mm = (struct matrix_material) MATRIX_MATERIAL_INIT;
262 *groupp = casereader_clone (group);
264 const struct variable **vars = mr->cvars;
265 size_t n_vars = mr->n_cvars;
267 *mm = (struct matrix_material) { .n = NULL };
276 struct matrix matrices[] = {
277 { .name = "CORR", .m = &mm->corr },
278 { .name = "COV", .m = &mm->cov },
280 enum { N_MATRICES = 2 };
283 for (; (c = casereader_read (group)); case_unref (c))
285 struct substring rowtype = matrix_reader_get_string (c, mr->rowtype);
288 = (ss_equals_case (rowtype, ss_cstr ("N")) ? &mm->n
289 : ss_equals_case (rowtype, ss_cstr ("MEAN")) ? &mm->mean_matrix
290 : ss_equals_case (rowtype, ss_cstr ("STDDEV")) ? &mm->var_matrix
295 *v = gsl_matrix_calloc (n_vars, n_vars);
297 for (int x = 0; x < n_vars; ++x)
299 double n = case_num (c, vars[x]);
300 if (v == &mm->var_matrix)
302 for (int y = 0; y < n_vars; ++y)
303 gsl_matrix_set (*v, y, x, n);
308 struct matrix *m = NULL;
309 for (size_t i = 0; i < N_MATRICES; i++)
310 if (ss_equals_case (rowtype, ss_cstr (matrices[i].name)))
317 struct substring varname_raw = case_ss (c, mr->varname);
318 struct substring varname = ss_cstr (
319 recode_string (UTF8, dict_get_encoding (mr->dict),
320 varname_raw.string, varname_raw.length));
321 ss_rtrim (&varname, ss_cstr (CC_SPACES));
322 varname.string[varname.length] = '\0';
324 int y = find_varname (vars, n_vars, varname.string);
328 matrix_fill_row (m->m, c, y, vars, n_vars);
332 ss_dealloc (&varname);
335 casereader_destroy (group);
337 for (size_t i = 0; i < N_MATRICES; i++)
338 if (matrices[i].good_rows && matrices[i].good_rows != n_vars)
339 msg (SW, _("%s matrix has %zu columns but %zu rows named variables "
340 "to be analyzed (and %zu rows named unknown variables)."),
341 matrices[i].name, n_vars, matrices[i].good_rows,
342 matrices[i].bad_rows);
348 cmd_debug_matrix_read (struct lexer *lexer, struct dataset *ds)
350 if (lex_match_id (lexer, "NODATA"))
352 struct casereader *cr = casewriter_make_reader (
353 mem_writer_create (dict_get_proto (dataset_dict (ds))));
354 struct matrix_reader *mr = matrix_reader_create (dataset_dict (ds), cr);
357 casereader_destroy (cr);
360 matrix_reader_destroy (mr);
364 struct matrix_reader *mr = matrix_reader_create (dataset_dict (ds),
369 struct pivot_table *pt = pivot_table_create ("Debug Matrix Reader");
379 const char *mm_stat_names[] = {
380 [MM_CORR] = "Correlation",
381 [MM_COV] = "Covariance",
384 [MM_STDDEV] = "Standard Deviation",
386 enum { N_STATS = sizeof mm_stat_names / sizeof *mm_stat_names };
387 for (size_t i = 0; i < 2; i++)
389 struct pivot_dimension *d = pivot_dimension_create (
391 i ? PIVOT_AXIS_COLUMN : PIVOT_AXIS_ROW,
392 i ? "Column" : "Row");
394 pivot_category_create_leaf_rc (d->root, pivot_value_new_text ("Value"),
395 PIVOT_RC_CORRELATION);
396 for (size_t j = 0; j < mr->n_cvars; j++)
397 pivot_category_create_leaf_rc (
398 d->root, pivot_value_new_variable (mr->cvars[j]),
399 PIVOT_RC_CORRELATION);
402 struct pivot_dimension *stat = pivot_dimension_create (pt, PIVOT_AXIS_ROW,
404 for (size_t i = 0; i < N_STATS; i++)
405 pivot_category_create_leaf (stat->root,
406 pivot_value_new_text (mm_stat_names[i]));
408 struct pivot_dimension *split = pivot_dimension_create (
409 pt, PIVOT_AXIS_ROW, "Split");
413 struct matrix_material mm = MATRIX_MATERIAL_INIT;
414 while (matrix_reader_next (&mm, mr, NULL))
416 pivot_category_create_leaf (split->root,
417 pivot_value_new_integer (split_num + 1));
419 const gsl_matrix *m[N_STATS] = {
423 [MM_MEAN] = mm.mean_matrix,
424 [MM_STDDEV] = mm.var_matrix,
427 for (size_t i = 0; i < N_STATS; i++)
430 if (i == MM_COV || i == MM_CORR)
432 for (size_t y = 0; y < mr->n_cvars; y++)
433 for (size_t x = 0; x < mr->n_cvars; x++)
435 pt, y + 1, x, i, split_num,
436 pivot_value_new_number (gsl_matrix_get (m[i], y, x)));
439 for (size_t x = 0; x < mr->n_cvars; x++)
441 double n = gsl_matrix_get (m[i], 0, x);
444 pivot_table_put4 (pt, 0, x, i, split_num,
445 pivot_value_new_number (n));
450 matrix_material_uninit (&mm);
452 pivot_table_submit (pt);
456 matrix_reader_destroy (mr);