1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <math/covariance-matrix.h>
42 #include <math/coefficient.h>
43 #include <math/linreg.h>
44 #include <math/moments.h>
45 #include <output/table.h>
50 #define GLM_LARGE_DATA 10000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
73 const struct variable *v;
78 Dependent variable used.
80 static const struct variable **v_dependent;
83 Number of dependent variables.
85 static size_t n_dependent;
89 Return value for the procedure.
91 static int pspp_glm_rc = CMD_SUCCESS;
93 int cmd_glm (struct lexer *lexer, struct dataset *ds);
96 static bool run_glm (struct casereader *,
98 const struct dataset *);
101 cmd_glm (struct lexer *lexer, struct dataset *ds)
103 struct casegrouper *grouper;
104 struct casereader *group;
108 if (!parse_glm (lexer, ds, &cmd, NULL))
112 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
113 while (casegrouper_get_next_group (grouper, &group))
115 run_glm (group, &cmd, ds);
117 ok = casegrouper_destroy (grouper);
118 ok = proc_commit (ds) && ok;
121 return ok ? CMD_SUCCESS : CMD_FAILURE;
124 /* Parser for the dependent sub command */
126 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
127 struct cmd_glm *cmd UNUSED, void *aux UNUSED)
129 const struct dictionary *dict = dataset_dict (ds);
131 if ((lex_token (lexer) != T_ID
132 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
133 && lex_token (lexer) != T_ALL)
136 if (!parse_variables_const
137 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
142 assert (n_dependent);
144 msg (SE, _("Multivariate GLM not yet supported"));
145 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
151 COV is the covariance matrix for variables included in the
152 model. That means the dependent variable is in there, too.
155 coeff_init (pspp_linreg_cache * c, struct design_matrix *cov)
157 c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff));
158 c->n_coeffs = cov->m->size2 - 1;
159 pspp_coeff_init (c->coeff, cov);
162 /* Encode categorical variables.
163 Returns number of valid cases. */
165 data_pass_one (struct casereader *input,
166 const struct variable **vars, size_t n_vars,
167 struct moments_var **mom)
173 for (i = 0; i < n_vars; i++)
175 mom[i] = xmalloc (sizeof (*mom[i]));
177 mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
178 mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
179 mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
180 mom[i]->m = moments1_create (MOMENT_VARIANCE);
181 if (var_is_alpha (vars[i]))
182 cat_stored_values_create (vars[i]);
186 for (; casereader_read (input, &c); case_destroy (&c))
189 The second condition ensures the program will run even if
190 there is only one variable to act as both explanatory and
193 for (i = 0; i < n_vars; i++)
195 const union value *val = case_data (&c, vars[i]);
196 if (var_is_alpha (vars[i]))
197 cat_value_update (vars[i], val);
199 moments1_add (mom[i]->m, val->f, 1.0);
203 casereader_destroy (input);
204 for (i = 0; i < n_vars; i++)
206 if (var_is_numeric (mom[i]->v))
208 moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
209 mom[i]->variance, NULL, NULL);
217 run_glm (struct casereader *input,
219 const struct dataset *ds)
221 pspp_linreg_cache *model = NULL;
226 const struct variable **indep_vars;
227 const struct variable **all_vars;
228 struct design_matrix *X;
229 struct moments_var **mom;
230 struct casereader *reader;
233 size_t n_data; /* Number of valid cases. */
235 pspp_linreg_opts lopts;
237 if (!casereader_peek (input, 0, &c))
239 casereader_destroy (input);
242 output_split_file_values (ds, &c);
247 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
251 lopts.get_depvar_mean_std = 1;
253 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
254 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
255 n_all_vars = cmd->n_by + n_dependent;
256 all_vars = xnmalloc (n_all_vars, sizeof *all_vars);
258 for (i = 0; i < n_dependent; i++)
260 all_vars[i] = v_dependent[i];
262 for (i = 0; i < cmd->n_by; i++)
264 indep_vars[i] = cmd->v_by[i];
265 all_vars[i + n_dependent] = cmd->v_by[i];
268 mom = xnmalloc (n_all_vars, sizeof (*mom));
271 reader = casereader_clone (input);
272 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
274 reader = casereader_create_filter_missing (reader, v_dependent, 1,
276 n_data = data_pass_one (casereader_clone (reader),
277 (const struct variable **) all_vars, n_all_vars,
280 if ((n_data > 0) && (n_indep > 0))
282 for (i = 0; i < n_all_vars; i++)
283 if (var_is_alpha (all_vars[i]))
284 cat_stored_values_create (all_vars[i]);
287 covariance_matrix_create (n_all_vars,
288 (const struct variable **) all_vars);
289 reader = casereader_create_counter (reader, &row, -1);
290 for (; casereader_read (reader, &c); case_destroy (&c))
293 Accumulate the covariance matrix.
295 for (i = 0; i < n_all_vars; ++i)
297 const struct variable *v = all_vars[i];
298 const union value *val_v = case_data (&c, v);
299 if (var_is_alpha (all_vars[i]))
300 cat_value_update (all_vars[i], val_v);
301 for (j = i; j < n_all_vars; j++)
303 const struct variable *w = all_vars[j];
304 const union value *val_w = case_data (&c, w);
305 covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
311 model = pspp_linreg_cache_alloc (v_dependent[0], indep_vars, n_data, n_indep);
313 For large data sets, use QR decomposition.
315 if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA)
317 model->method = PSPP_LINREG_QR;
319 coeff_init (model, X);
320 pspp_linreg_with_cov (X, model);
321 casereader_destroy (reader);
322 for (i = 0; i < n_all_vars; i++)
324 moments1_destroy (mom[i]->m);
326 free (mom[i]->variance);
327 free (mom[i]->weight);
331 covariance_matrix_destroy (X);
335 msg (SE, gettext ("No valid data found. This command was skipped."));
338 free (lopts.get_indep_mean_std);
339 pspp_linreg_cache_free (model);
340 casereader_destroy (input);