1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <math/covariance-matrix.h>
42 #include <math/coefficient.h>
43 #include <math/linreg.h>
44 #include <math/moments.h>
45 #include <output/table.h>
50 #define GLM_LARGE_DATA 10000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
73 const struct variable *v;
78 Dependent variable used.
80 static const struct variable **v_dependent;
83 Number of dependent variables.
85 static size_t n_dependent;
89 Return value for the procedure.
91 static int pspp_glm_rc = CMD_SUCCESS;
93 int cmd_glm (struct lexer *lexer, struct dataset *ds);
96 static bool run_glm (struct casereader *,
98 const struct dataset *, pspp_linreg_cache *);
101 cmd_glm (struct lexer *lexer, struct dataset *ds)
103 struct casegrouper *grouper;
104 struct casereader *group;
105 pspp_linreg_cache *model = NULL;
109 model = xmalloc (sizeof *model);
111 if (!parse_glm (lexer, ds, &cmd, NULL))
115 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
116 while (casegrouper_get_next_group (grouper, &group))
118 run_glm (group, &cmd, ds, model);
120 ok = casegrouper_destroy (grouper);
121 ok = proc_commit (ds) && ok;
125 return ok ? CMD_SUCCESS : CMD_FAILURE;
128 /* Parser for the dependent sub command */
130 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
131 struct cmd_glm *cmd UNUSED, void *aux UNUSED)
133 const struct dictionary *dict = dataset_dict (ds);
135 if ((lex_token (lexer) != T_ID
136 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
137 && lex_token (lexer) != T_ALL)
140 if (!parse_variables_const
141 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
146 assert (n_dependent);
148 msg (SE, _("Multivariate GLM not yet supported"));
149 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
155 COV is the covariance matrix for variables included in the
156 model. That means the dependent variable is in there, too.
159 coeff_init (pspp_linreg_cache * c, struct design_matrix *cov)
161 c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
162 c->n_coeffs = cov->m->size2 - 1;
163 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
164 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
165 pspp_coeff_init (c->coeff + 1, cov);
169 Put the moments in the linreg cache.
172 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
173 struct design_matrix *dm, size_t n)
183 Scan the variable names in the columns of the design matrix.
184 When we find the variable we need, insert its mean in the cache.
186 for (i = 0; i < dm->m->size2; i++)
188 for (j = 0; j < n; j++)
190 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
192 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
193 &skewness, &kurtosis);
194 gsl_vector_set (c->indep_means, i, mean);
195 gsl_vector_set (c->indep_std, i, sqrt (variance));
201 /* Encode categorical variables.
202 Returns number of valid cases. */
204 data_pass_one (struct casereader *input,
205 const struct variable **vars, size_t n_vars,
206 struct moments_var **mom)
212 for (i = 0; i < n_vars; i++)
214 mom[i] = xmalloc (sizeof (*mom[i]));
216 mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
217 mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
218 mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
219 mom[i]->m = moments1_create (MOMENT_VARIANCE);
220 if (var_is_alpha (vars[i]))
221 cat_stored_values_create (vars[i]);
225 for (; casereader_read (input, &c); case_destroy (&c))
228 The second condition ensures the program will run even if
229 there is only one variable to act as both explanatory and
232 for (i = 0; i < n_vars; i++)
234 const union value *val = case_data (&c, vars[i]);
235 if (var_is_alpha (vars[i]))
236 cat_value_update (vars[i], val);
238 moments1_add (mom[i]->m, val->f, 1.0);
242 casereader_destroy (input);
243 for (i = 0; i < n_vars; i++)
245 if (var_is_numeric (mom[i]->v))
247 moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
248 mom[i]->variance, NULL, NULL);
256 run_glm (struct casereader *input,
258 const struct dataset *ds, pspp_linreg_cache * model)
264 const struct variable **indep_vars;
265 const struct variable **all_vars;
266 struct design_matrix *X;
267 struct moments_var **mom;
268 struct casereader *reader;
271 size_t n_data; /* Number of valid cases. */
273 pspp_linreg_opts lopts;
275 assert (model != NULL);
277 if (!casereader_peek (input, 0, &c))
279 casereader_destroy (input);
282 output_split_file_values (ds, &c);
287 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
293 lopts.get_depvar_mean_std = 1;
295 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
296 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
297 n_all_vars = cmd->n_by + n_dependent;
298 all_vars = xnmalloc (n_all_vars, sizeof *all_vars);
300 for (i = 0; i < n_dependent; i++)
302 all_vars[i] = v_dependent[i];
304 for (i = 0; i < cmd->n_by; i++)
306 indep_vars[i] = cmd->v_by[i];
307 all_vars[i + n_dependent] = cmd->v_by[i];
310 mom = xnmalloc (n_all_vars, sizeof (*mom));
313 reader = casereader_clone (input);
314 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
316 reader = casereader_create_filter_missing (reader, v_dependent, 1,
318 n_data = data_pass_one (casereader_clone (reader),
319 (const struct variable **) all_vars, n_all_vars,
322 if ((n_data > 0) && (n_indep > 0))
325 covariance_matrix_create (n_all_vars,
326 (const struct variable **) all_vars);
327 reader = casereader_create_counter (reader, &row, -1);
328 for (; casereader_read (reader, &c); case_destroy (&c))
331 Accumulate the covariance matrix.
333 for (i = 0; i < n_all_vars; ++i)
335 const struct variable *v = all_vars[i];
336 const union value *val_v = case_data (&c, v);
337 for (j = i; j < n_all_vars; j++)
339 const struct variable *w = all_vars[j];
340 const union value *val_w = case_data (&c, w);
341 covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
347 model = pspp_linreg_cache_alloc (v_dependent, indep_vars, n_data, n_indep);
349 For large data sets, use QR decomposition.
351 if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA)
353 model->method = PSPP_LINREG_QR;
355 coeff_init (model, X);
356 pspp_linreg_with_cov (X, model);
357 casereader_destroy (reader);
358 for (i = 0; i < n_all_vars; i++)
360 moments1_destroy (mom[i]->m);
362 free (mom[i]->variance);
363 free (mom[i]->weight);
367 covariance_matrix_destroy (X);
371 msg (SE, gettext ("No valid data found. This command was skipped."));
374 free (lopts.get_indep_mean_std);
375 pspp_linreg_cache_free (model);
376 casereader_destroy (input);