1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <math/covariance-matrix.h>
42 #include <math/coefficient.h>
43 #include <math/linreg.h>
44 #include <math/moments.h>
45 #include <output/table.h>
50 #define GLM_LARGE_DATA 10000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
73 const struct variable *v;
78 Dependent variable used.
80 static const struct variable **v_dependent;
83 Number of dependent variables.
85 static size_t n_dependent;
89 Return value for the procedure.
91 static int pspp_glm_rc = CMD_SUCCESS;
93 int cmd_glm (struct lexer *lexer, struct dataset *ds);
96 static bool run_glm (struct casereader *,
98 const struct dataset *);
101 cmd_glm (struct lexer *lexer, struct dataset *ds)
103 struct casegrouper *grouper;
104 struct casereader *group;
108 if (!parse_glm (lexer, ds, &cmd, NULL))
112 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
113 while (casegrouper_get_next_group (grouper, &group))
115 run_glm (group, &cmd, ds);
117 ok = casegrouper_destroy (grouper);
118 ok = proc_commit (ds) && ok;
121 return ok ? CMD_SUCCESS : CMD_FAILURE;
124 /* Parser for the dependent sub command */
126 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
127 struct cmd_glm *cmd UNUSED, void *aux UNUSED)
129 const struct dictionary *dict = dataset_dict (ds);
131 if ((lex_token (lexer) != T_ID
132 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
133 && lex_token (lexer) != T_ALL)
136 if (!parse_variables_const
137 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
142 assert (n_dependent);
144 msg (SE, _("Multivariate GLM not yet supported"));
145 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
151 COV is the covariance matrix for variables included in the
152 model. That means the dependent variable is in there, too.
155 coeff_init (pspp_linreg_cache * c, struct design_matrix *cov)
157 c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
158 c->n_coeffs = cov->m->size2 - 1;
159 pspp_coeff_init (c->coeff, cov);
163 Put the moments in the linreg cache.
166 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
167 struct design_matrix *dm, size_t n)
177 Scan the variable names in the columns of the design matrix.
178 When we find the variable we need, insert its mean in the cache.
180 for (i = 0; i < dm->m->size2; i++)
182 for (j = 0; j < n; j++)
184 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
186 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
187 &skewness, &kurtosis);
188 gsl_vector_set (c->indep_means, i, mean);
189 gsl_vector_set (c->indep_std, i, sqrt (variance));
195 /* Encode categorical variables.
196 Returns number of valid cases. */
198 data_pass_one (struct casereader *input,
199 const struct variable **vars, size_t n_vars,
200 struct moments_var **mom)
206 for (i = 0; i < n_vars; i++)
208 mom[i] = xmalloc (sizeof (*mom[i]));
210 mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
211 mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
212 mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
213 mom[i]->m = moments1_create (MOMENT_VARIANCE);
214 if (var_is_alpha (vars[i]))
215 cat_stored_values_create (vars[i]);
219 for (; casereader_read (input, &c); case_destroy (&c))
222 The second condition ensures the program will run even if
223 there is only one variable to act as both explanatory and
226 for (i = 0; i < n_vars; i++)
228 const union value *val = case_data (&c, vars[i]);
229 if (var_is_alpha (vars[i]))
230 cat_value_update (vars[i], val);
232 moments1_add (mom[i]->m, val->f, 1.0);
236 casereader_destroy (input);
237 for (i = 0; i < n_vars; i++)
239 if (var_is_numeric (mom[i]->v))
241 moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
242 mom[i]->variance, NULL, NULL);
250 run_glm (struct casereader *input,
252 const struct dataset *ds)
254 pspp_linreg_cache *model = NULL;
259 const struct variable **indep_vars;
260 const struct variable **all_vars;
261 struct design_matrix *X;
262 struct moments_var **mom;
263 struct casereader *reader;
266 size_t n_data; /* Number of valid cases. */
268 pspp_linreg_opts lopts;
270 if (!casereader_peek (input, 0, &c))
272 casereader_destroy (input);
275 output_split_file_values (ds, &c);
280 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
284 lopts.get_depvar_mean_std = 1;
286 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
287 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
288 n_all_vars = cmd->n_by + n_dependent;
289 all_vars = xnmalloc (n_all_vars, sizeof *all_vars);
291 for (i = 0; i < n_dependent; i++)
293 all_vars[i] = v_dependent[i];
295 for (i = 0; i < cmd->n_by; i++)
297 indep_vars[i] = cmd->v_by[i];
298 all_vars[i + n_dependent] = cmd->v_by[i];
301 mom = xnmalloc (n_all_vars, sizeof (*mom));
304 reader = casereader_clone (input);
305 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
307 reader = casereader_create_filter_missing (reader, v_dependent, 1,
309 n_data = data_pass_one (casereader_clone (reader),
310 (const struct variable **) all_vars, n_all_vars,
313 if ((n_data > 0) && (n_indep > 0))
315 for (i = 0; i < n_all_vars; i++)
316 if (var_is_alpha (all_vars[i]))
317 cat_stored_values_create (all_vars[i]);
320 covariance_matrix_create (n_all_vars,
321 (const struct variable **) all_vars);
322 reader = casereader_create_counter (reader, &row, -1);
323 for (; casereader_read (reader, &c); case_destroy (&c))
326 Accumulate the covariance matrix.
328 for (i = 0; i < n_all_vars; ++i)
330 const struct variable *v = all_vars[i];
331 const union value *val_v = case_data (&c, v);
332 if (var_is_alpha (all_vars[i]))
333 cat_value_update (all_vars[i], val_v);
334 for (j = i; j < n_all_vars; j++)
336 const struct variable *w = all_vars[j];
337 const union value *val_w = case_data (&c, w);
338 covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
344 model = pspp_linreg_cache_alloc (v_dependent[0], indep_vars, n_data, n_indep);
346 For large data sets, use QR decomposition.
348 if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA)
350 model->method = PSPP_LINREG_QR;
352 coeff_init (model, X);
353 pspp_linreg_with_cov (X, model);
354 casereader_destroy (reader);
355 for (i = 0; i < n_all_vars; i++)
357 moments1_destroy (mom[i]->m);
359 free (mom[i]->variance);
360 free (mom[i]->weight);
364 covariance_matrix_destroy (X);
368 msg (SE, gettext ("No valid data found. This command was skipped."));
371 free (lopts.get_indep_mean_std);
372 pspp_linreg_cache_free (model);
373 casereader_destroy (input);