1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/alloc.h>
40 #include <libpspp/compiler.h>
41 #include <libpspp/message.h>
42 #include <math/design-matrix.h>
43 #include <math/coefficient.h>
44 #include <math/linreg/linreg.h>
45 #include <math/moments.h>
46 #include <output/table.h>
50 #define GLM_LARGE_DATA 1000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
70 const struct variable *v;
75 Dependent variable used.
77 static const struct variable **v_dependent;
80 Number of dependent variables.
82 static size_t n_dependent;
86 Return value for the procedure.
88 static int pspp_glm_rc = CMD_SUCCESS;
90 int cmd_glm (struct lexer *lexer, struct dataset *ds);
93 static bool run_glm (struct casereader*,
95 const struct dataset *,
99 cmd_glm (struct lexer *lexer, struct dataset *ds)
101 struct casegrouper *grouper;
102 struct casereader *group;
103 pspp_linreg_cache *model = NULL;
107 model = xmalloc (sizeof *model);
109 if (!parse_glm (lexer, ds, &cmd, NULL))
113 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
114 while (casegrouper_get_next_group (grouper, &group))
116 run_glm (group, &cmd, ds, model);
118 ok = casegrouper_destroy (grouper);
119 ok = proc_commit (ds) && ok;
123 return ok ? CMD_SUCCESS : CMD_FAILURE;
126 /* Parser for the dependent sub command */
128 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
129 struct cmd_glm *cmd UNUSED,
132 const struct dictionary *dict = dataset_dict (ds);
134 if ((lex_token (lexer) != T_ID
135 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
136 && lex_token (lexer) != T_ALL)
139 if (!parse_variables_const
140 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
145 assert (n_dependent);
147 msg (SE, _("Multivariate GLM not yet supported"));
148 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
154 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
156 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
157 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
158 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
159 pspp_coeff_init (c->coeff + 1, dm);
163 Put the moments in the linreg cache.
166 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
167 struct design_matrix *dm, size_t n)
177 Scan the variable names in the columns of the design matrix.
178 When we find the variable we need, insert its mean in the cache.
180 for (i = 0; i < dm->m->size2; i++)
182 for (j = 0; j < n; j++)
184 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
186 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
187 &skewness, &kurtosis);
188 gsl_vector_set (c->indep_means, i, mean);
189 gsl_vector_set (c->indep_std, i, sqrt (variance));
194 /* Encode categorical variables.
195 Returns number of valid cases. */
197 prepare_categories (struct casereader *input,
198 const struct variable **vars, size_t n_vars,
199 struct moments_var *mom)
205 for (i = 0; i < n_vars; i++)
206 if (var_is_alpha (vars[i]))
207 cat_stored_values_create (vars[i]);
210 for (; casereader_read (input, &c); case_destroy (&c))
213 The second condition ensures the program will run even if
214 there is only one variable to act as both explanatory and
217 for (i = 0; i < n_vars; i++)
219 const union value *val = case_data (&c, vars[i]);
220 if (var_is_alpha (vars[i]))
221 cat_value_update (vars[i], val);
223 moments1_add (mom[i].m, val->f, 1.0);
227 casereader_destroy (input);
233 run_glm (struct casereader *input,
235 const struct dataset *ds,
236 pspp_linreg_cache *model)
241 const struct variable **indep_vars;
242 struct design_matrix *X;
243 struct moments_var *mom;
245 struct casereader *reader;
247 size_t n_data; /* Number of valid cases. */
249 pspp_linreg_opts lopts;
251 assert (model != NULL);
253 if (!casereader_peek (input, 0, &c))
255 output_split_file_values (ds, &c);
260 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
264 for (i = 0; i < n_dependent; i++)
266 if (!var_is_numeric (v_dependent[i]))
268 msg (SE, _("Dependent variable must be numeric."));
273 mom = xnmalloc (n_dependent, sizeof (*mom));
274 mom->m = moments1_create (MOMENT_VARIANCE);
275 mom->v = v_dependent[0];
276 lopts.get_depvar_mean_std = 1;
278 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
279 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
281 for (i = 0; i < cmd->n_by; i++)
283 indep_vars[i] = cmd->v_by[i];
287 reader = casereader_clone (input);
288 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
290 reader = casereader_create_filter_missing (reader, v_dependent, 1,
292 n_data = prepare_categories (casereader_clone (reader),
293 indep_vars, n_indep, mom);
295 if ((n_data > 0) && (n_indep > 0))
297 Y = gsl_vector_alloc (n_data);
299 design_matrix_create (n_indep,
300 (const struct variable **) indep_vars,
302 for (i = 0; i < X->m->size2; i++)
304 lopts.get_indep_mean_std[i] = 1;
306 model = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
307 model->indep_means = gsl_vector_alloc (X->m->size2);
308 model->indep_std = gsl_vector_alloc (X->m->size2);
309 model->depvar = v_dependent[0];
310 reader = casereader_create_counter (reader, &row, -1);
311 for (; casereader_read (reader, &c); case_destroy (&c))
313 for (i = 0; i < n_indep; ++i)
315 const struct variable *v = indep_vars[i];
316 const union value *val = case_data (&c, v);
317 if (var_is_alpha (v))
318 design_matrix_set_categorical (X, row, v, val);
320 design_matrix_set_numeric (X, row, v, val);
322 gsl_vector_set (Y, row, case_num (&c, model->depvar));
324 casereader_destroy (reader);
326 Now that we know the number of coefficients, allocate space
327 and store pointers to the variables that correspond to the
330 coeff_init (model, X);
333 Find the least-squares estimates and other statistics.
335 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, model);
336 compute_moments (model, mom, X, n_dependent);
339 design_matrix_destroy (X);
343 msg (SE, gettext ("No valid data found. This command was skipped."));
346 free (lopts.get_indep_mean_std);
347 casereader_destroy (input);