1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/casegrouper.h"
25 #include "data/casereader.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/value.h"
30 #include "language/command.h"
31 #include "language/dictionary/split-file.h"
32 #include "language/lexer/lexer.h"
33 #include "language/lexer/value-parser.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/ll.h"
36 #include "libpspp/message.h"
37 #include "libpspp/misc.h"
38 #include "libpspp/taint.h"
39 #include "linreg/sweep.h"
40 #include "math/categoricals.h"
41 #include "math/covariance.h"
42 #include "math/moments.h"
43 #include "output/tab.h"
46 #define _(msgid) gettext (msgid)
51 const struct variable **dep_vars;
54 const struct variable **factor_vars;
56 enum mv_class exclude;
58 /* The weight variable */
59 const struct variable *wv;
67 struct moments *totals;
69 struct categoricals *cats;
72 Sums of squares due to different variables. Element 0 is the SSE
73 for the entire model. For i > 0, element i is the SS due to
79 static void output_glm (const struct glm_spec *,
80 const struct glm_workspace *ws);
81 static void run_glm (struct glm_spec *cmd, struct casereader *input,
82 const struct dataset *ds);
85 cmd_glm (struct lexer *lexer, struct dataset *ds)
87 struct const_var_set *factors = NULL;
88 const struct dictionary *dict = dataset_dict (ds);
91 glm.n_factor_vars = 0;
93 glm.factor_vars = NULL;
96 glm.wv = dict_get_weight (dict);
99 if (!parse_variables_const (lexer, dict,
100 &glm.dep_vars, &glm.n_dep_vars,
101 PV_NO_DUPLICATE | PV_NUMERIC))
104 lex_force_match (lexer, T_BY);
106 if (!parse_variables_const (lexer, dict,
107 &glm.factor_vars, &glm.n_factor_vars,
108 PV_NO_DUPLICATE | PV_NUMERIC))
111 if (glm.n_dep_vars > 1)
113 msg (ME, _("Multivariate analysis is not yet implemented"));
118 const_var_set_create_from_array (glm.factor_vars, glm.n_factor_vars);
120 while (lex_token (lexer) != T_ENDCMD)
122 lex_match (lexer, T_SLASH);
124 if (lex_match_id (lexer, "MISSING"))
126 lex_match (lexer, T_EQUALS);
127 while (lex_token (lexer) != T_ENDCMD
128 && lex_token (lexer) != T_SLASH)
130 if (lex_match_id (lexer, "INCLUDE"))
132 glm.exclude = MV_SYSTEM;
134 else if (lex_match_id (lexer, "EXCLUDE"))
136 glm.exclude = MV_ANY;
140 lex_error (lexer, NULL);
145 else if (lex_match_id (lexer, "INTERCEPT"))
147 lex_match (lexer, T_EQUALS);
148 while (lex_token (lexer) != T_ENDCMD
149 && lex_token (lexer) != T_SLASH)
151 if (lex_match_id (lexer, "INCLUDE"))
153 glm.intercept = true;
155 else if (lex_match_id (lexer, "EXCLUDE"))
157 glm.intercept = false;
161 lex_error (lexer, NULL);
167 else if (lex_match_id (lexer, "DESIGN"))
170 const struct variable **des;
171 lex_match (lexer, T_EQUALS);
173 parse_const_var_set_vars (lexer, factors, &des, &n_des, 0);
178 lex_error (lexer, NULL);
185 struct casegrouper *grouper;
186 struct casereader *group;
189 grouper = casegrouper_create_splits (proc_open (ds), dict);
190 while (casegrouper_get_next_group (grouper, &group))
191 run_glm (&glm, group, ds);
192 ok = casegrouper_destroy (grouper);
193 ok = proc_commit (ds) && ok;
196 const_var_set_destroy (factors);
197 free (glm.factor_vars);
204 const_var_set_destroy (factors);
205 free (glm.factor_vars);
211 static void get_ssq (struct covariance *, gsl_vector *,
212 const struct glm_spec *);
215 not_dropped (size_t j, size_t * dropped, size_t n_dropped)
219 for (i = 0; i < n_dropped; i++)
228 get_ssq (struct covariance *cov, gsl_vector * ssq, const struct glm_spec *cmd)
230 const struct variable **vars;
231 gsl_matrix *small_cov = NULL;
232 gsl_matrix *cm = covariance_calculate_unnormalized (cov);
241 dropped = xcalloc (covariance_dim (cov), sizeof (*dropped));
242 vars = xcalloc (covariance_dim (cov), sizeof (*vars));
243 covariance_get_var_indices (cov, vars);
245 for (k = 0; k < cmd->n_factor_vars; k++)
248 for (i = 1; i < covariance_dim (cov); i++)
250 if (vars[i] == cmd->factor_vars[k])
252 dropped[n_dropped++] = i;
256 gsl_matrix_alloc (cm->size1 - n_dropped, cm->size2 - n_dropped);
257 gsl_matrix_set (small_cov, 0, 0, gsl_matrix_get (cm, 0, 0));
260 for (i = 0; i < cm->size1; i++)
262 if (not_dropped (i, dropped, n_dropped))
265 for (j = 0; j < cm->size2; j++)
267 if (not_dropped (j, dropped, n_dropped))
269 gsl_matrix_set (small_cov, n, m,
270 gsl_matrix_get (cm, i, j));
277 reg_sweep (small_cov, 0);
278 gsl_vector_set (ssq, k + 1,
279 gsl_matrix_get (small_cov, 0, 0)
280 - gsl_vector_get (ssq, 0));
281 gsl_matrix_free (small_cov);
286 gsl_matrix_free (cm);
289 //static void dump_matrix (const gsl_matrix *m);
292 run_glm (struct glm_spec *cmd, struct casereader *input,
293 const struct dataset *ds)
295 bool warn_bad_weight = true;
298 struct dictionary *dict = dataset_dict (ds);
299 struct casereader *reader;
302 struct glm_workspace ws;
303 struct covariance *cov;
304 ws.cats = categoricals_create (cmd->factor_vars, cmd->n_factor_vars,
305 cmd->wv, cmd->exclude,
306 NULL, NULL, NULL, NULL);
308 cov = covariance_2pass_create (cmd->n_dep_vars, cmd->dep_vars,
309 ws.cats, cmd->wv, cmd->exclude);
312 c = casereader_peek (input, 0);
315 casereader_destroy (input);
318 output_split_file_values (ds, c);
321 taint = taint_clone (casereader_get_taint (input));
323 ws.totals = moments_create (MOMENT_VARIANCE);
325 for (reader = casereader_clone (input);
326 (c = casereader_read (reader)) != NULL; case_unref (c))
328 double weight = dict_get_case_weight (dict, c, &warn_bad_weight);
330 for (v = 0; v < cmd->n_dep_vars; ++v)
331 moments_pass_one (ws.totals, case_data (c, cmd->dep_vars[v])->f,
334 covariance_accumulate_pass1 (cov, c);
336 casereader_destroy (reader);
338 categoricals_done (ws.cats);
341 (c = casereader_read (reader)) != NULL; case_unref (c))
343 double weight = dict_get_case_weight (dict, c, &warn_bad_weight);
345 for (v = 0; v < cmd->n_dep_vars; ++v)
346 moments_pass_two (ws.totals, case_data (c, cmd->dep_vars[v])->f,
349 covariance_accumulate_pass2 (cov, c);
351 casereader_destroy (reader);
354 gsl_matrix *cm = covariance_calculate_unnormalized (cov);
358 ws.total_ssq = gsl_matrix_get (cm, 0, 0);
363 Store the overall SSE.
365 ws.ssq = gsl_vector_alloc (cm->size1);
366 gsl_vector_set (ws.ssq, 0, gsl_matrix_get (cm, 0, 0));
367 get_ssq (cov, ws.ssq, cmd);
370 gsl_matrix_free (cm);
373 if (!taint_has_tainted_successor (taint))
374 output_glm (cmd, &ws);
376 gsl_vector_free (ws.ssq);
378 covariance_destroy (cov);
379 moments_destroy (ws.totals);
381 taint_destroy (taint);
385 output_glm (const struct glm_spec *cmd, const struct glm_workspace *ws)
387 const struct fmt_spec *wfmt =
388 cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
390 double n_total, mean;
391 double df_corr = 0.0;
396 const int heading_columns = 1;
397 const int heading_rows = 1;
401 int nr = heading_rows + 4 + cmd->n_factor_vars;
405 t = tab_create (nc, nr);
406 tab_title (t, _("Tests of Between-Subjects Effects"));
408 tab_headers (t, heading_columns, 0, heading_rows, 0);
410 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
412 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
413 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
415 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source"));
417 /* TRANSLATORS: The parameter is a roman numeral */
418 tab_text_format (t, 1, 0, TAB_CENTER | TAT_TITLE,
419 _("Type %s Sum of Squares"), "III");
420 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("df"));
421 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
422 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("F"));
423 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("Sig."));
425 moments_calculate (ws->totals, &n_total, &mean, NULL, NULL, NULL);
430 for (f = 0; f < cmd->n_factor_vars; ++f)
431 df_corr += categoricals_n_count (ws->cats, f) - 1.0;
433 mse = gsl_vector_get (ws->ssq, 0) / (n_total - df_corr);
436 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Corrected Model"));
442 const double intercept = pow2 (mean * n_total) / n_total;
443 const double df = 1.0;
444 const double F = intercept / df / mse;
445 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Intercept"));
446 tab_double (t, 1, r, 0, intercept, NULL);
447 tab_double (t, 2, r, 0, 1.00, wfmt);
448 tab_double (t, 3, r, 0, intercept / df, NULL);
449 tab_double (t, 4, r, 0, F, NULL);
450 tab_double (t, 5, r, 0, gsl_cdf_fdist_Q (F, df, n_total - df_corr),
455 for (f = 0; f < cmd->n_factor_vars; ++f)
457 const double df = categoricals_n_count (ws->cats, f) - 1.0;
458 const double ssq = gsl_vector_get (ws->ssq, f + 1);
459 const double F = ssq / df / mse;
460 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE,
461 var_to_string (cmd->factor_vars[f]));
463 tab_double (t, 1, r, 0, ssq, NULL);
464 tab_double (t, 2, r, 0, df, wfmt);
465 tab_double (t, 3, r, 0, ssq / df, NULL);
466 tab_double (t, 4, r, 0, F, NULL);
468 tab_double (t, 5, r, 0, gsl_cdf_fdist_Q (F, df, n_total - df_corr),
476 /* Corrected Model */
477 const double df = df_corr - 1.0;
478 const double ssq = ws->total_ssq - gsl_vector_get (ws->ssq, 0);
479 const double F = ssq / df / mse;
480 tab_double (t, 1, heading_rows, 0, ssq, NULL);
481 tab_double (t, 2, heading_rows, 0, df, wfmt);
482 tab_double (t, 3, heading_rows, 0, ssq / df, NULL);
483 tab_double (t, 4, heading_rows, 0, F, NULL);
485 tab_double (t, 5, heading_rows, 0,
486 gsl_cdf_fdist_Q (F, df, n_total - df_corr), NULL);
490 const double df = n_total - df_corr;
491 const double ssq = gsl_vector_get (ws->ssq, 0);
492 const double mse = ssq / df;
493 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Error"));
494 tab_double (t, 1, r, 0, ssq, NULL);
495 tab_double (t, 2, r, 0, df, wfmt);
496 tab_double (t, 3, r++, 0, mse, NULL);
501 const double intercept = pow2 (mean * n_total) / n_total;
502 const double ssq = intercept + ws->total_ssq;
504 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Total"));
505 tab_double (t, 1, r, 0, ssq, NULL);
506 tab_double (t, 2, r, 0, n_total, wfmt);
511 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Corrected Total"));
514 tab_double (t, 1, r, 0, ws->total_ssq, NULL);
515 tab_double (t, 2, r, 0, n_total - 1.0, wfmt);
522 dump_matrix (const gsl_matrix * m)
525 for (i = 0; i < m->size1; ++i)
527 for (j = 0; j < m->size2; ++j)
529 double x = gsl_matrix_get (m, i, j);