1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <math/design-matrix.h>
42 #include <math/coefficient.h>
43 #include <math/linreg/linreg.h>
44 #include <math/moments.h>
45 #include <output/table.h>
50 #define GLM_LARGE_DATA 1000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
70 const struct variable *v;
75 Dependent variable used.
77 static const struct variable **v_dependent;
80 Number of dependent variables.
82 static size_t n_dependent;
86 Return value for the procedure.
88 static int pspp_glm_rc = CMD_SUCCESS;
90 int cmd_glm (struct lexer *lexer, struct dataset *ds);
93 static bool run_glm (struct casereader*,
95 const struct dataset *,
99 cmd_glm (struct lexer *lexer, struct dataset *ds)
101 struct casegrouper *grouper;
102 struct casereader *group;
103 pspp_linreg_cache *model = NULL;
107 model = xmalloc (sizeof *model);
109 if (!parse_glm (lexer, ds, &cmd, NULL))
113 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
114 while (casegrouper_get_next_group (grouper, &group))
116 run_glm (group, &cmd, ds, model);
118 ok = casegrouper_destroy (grouper);
119 ok = proc_commit (ds) && ok;
123 return ok ? CMD_SUCCESS : CMD_FAILURE;
126 /* Parser for the dependent sub command */
128 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
129 struct cmd_glm *cmd UNUSED,
132 const struct dictionary *dict = dataset_dict (ds);
134 if ((lex_token (lexer) != T_ID
135 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
136 && lex_token (lexer) != T_ALL)
139 if (!parse_variables_const
140 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
145 assert (n_dependent);
147 msg (SE, _("Multivariate GLM not yet supported"));
148 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
154 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
156 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
157 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
158 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
159 pspp_coeff_init (c->coeff + 1, dm);
163 Put the moments in the linreg cache.
166 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
167 struct design_matrix *dm, size_t n)
177 Scan the variable names in the columns of the design matrix.
178 When we find the variable we need, insert its mean in the cache.
180 for (i = 0; i < dm->m->size2; i++)
182 for (j = 0; j < n; j++)
184 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
186 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
187 &skewness, &kurtosis);
188 gsl_vector_set (c->indep_means, i, mean);
189 gsl_vector_set (c->indep_std, i, sqrt (variance));
194 /* Encode categorical variables.
195 Returns number of valid cases. */
197 prepare_categories (struct casereader *input,
198 const struct variable **vars, size_t n_vars,
199 struct moments_var *mom)
205 for (i = 0; i < n_vars; i++)
206 if (var_is_alpha (vars[i]))
207 cat_stored_values_create (vars[i]);
210 for (; casereader_read (input, &c); case_destroy (&c))
213 The second condition ensures the program will run even if
214 there is only one variable to act as both explanatory and
217 for (i = 0; i < n_vars; i++)
219 const union value *val = case_data (&c, vars[i]);
220 if (var_is_alpha (vars[i]))
221 cat_value_update (vars[i], val);
223 moments1_add (mom[i].m, val->f, 1.0);
227 casereader_destroy (input);
233 run_glm (struct casereader *input,
235 const struct dataset *ds,
236 pspp_linreg_cache *model)
241 const struct variable **indep_vars;
242 struct design_matrix *X;
243 struct moments_var *mom;
245 struct casereader *reader;
247 size_t n_data; /* Number of valid cases. */
249 pspp_linreg_opts lopts;
251 assert (model != NULL);
253 if (!casereader_peek (input, 0, &c))
255 casereader_destroy (input);
258 output_split_file_values (ds, &c);
263 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
267 for (i = 0; i < n_dependent; i++)
269 if (!var_is_numeric (v_dependent[i]))
271 msg (SE, _("Dependent variable must be numeric."));
276 mom = xnmalloc (n_dependent, sizeof (*mom));
277 mom->m = moments1_create (MOMENT_VARIANCE);
278 mom->v = v_dependent[0];
279 lopts.get_depvar_mean_std = 1;
281 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
282 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
284 for (i = 0; i < cmd->n_by; i++)
286 indep_vars[i] = cmd->v_by[i];
290 reader = casereader_clone (input);
291 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
293 reader = casereader_create_filter_missing (reader, v_dependent, 1,
295 n_data = prepare_categories (casereader_clone (reader),
296 indep_vars, n_indep, mom);
298 if ((n_data > 0) && (n_indep > 0))
300 Y = gsl_vector_alloc (n_data);
302 design_matrix_create (n_indep,
303 (const struct variable **) indep_vars,
305 for (i = 0; i < X->m->size2; i++)
307 lopts.get_indep_mean_std[i] = 1;
309 model = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
310 model->indep_means = gsl_vector_alloc (X->m->size2);
311 model->indep_std = gsl_vector_alloc (X->m->size2);
312 model->depvar = v_dependent[0];
313 reader = casereader_create_counter (reader, &row, -1);
314 for (; casereader_read (reader, &c); case_destroy (&c))
316 for (i = 0; i < n_indep; ++i)
318 const struct variable *v = indep_vars[i];
319 const union value *val = case_data (&c, v);
320 if (var_is_alpha (v))
321 design_matrix_set_categorical (X, row, v, val);
323 design_matrix_set_numeric (X, row, v, val);
325 gsl_vector_set (Y, row, case_num (&c, model->depvar));
327 casereader_destroy (reader);
329 Now that we know the number of coefficients, allocate space
330 and store pointers to the variables that correspond to the
333 coeff_init (model, X);
336 Find the least-squares estimates and other statistics.
338 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, model);
339 compute_moments (model, mom, X, n_dependent);
342 design_matrix_destroy (X);
346 msg (SE, gettext ("No valid data found. This command was skipped."));
349 free (lopts.get_indep_mean_std);
350 casereader_destroy (input);