1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/alloc.h>
40 #include <libpspp/compiler.h>
41 #include <libpspp/message.h>
42 #include <math/design-matrix.h>
43 #include <math/coefficient.h>
44 #include <math/linreg/linreg.h>
45 #include <math/moments.h>
46 #include <output/table.h>
50 #define GLM_LARGE_DATA 1000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
70 const struct variable *v;
75 Dependent variable used.
77 static const struct variable **v_dependent;
80 Number of dependent variables.
82 static size_t n_dependent;
85 Return value for the procedure.
87 static int pspp_glm_rc = CMD_SUCCESS;
89 static bool run_glm (struct casereader*,
91 const struct dataset *,
95 cmd_glm (struct lexer *lexer, struct dataset *ds)
97 struct casegrouper *grouper;
98 struct casereader *group;
99 pspp_linreg_cache *model = NULL;
103 model = xmalloc (sizeof *model);
105 if (!parse_glm (lexer, ds, &cmd, NULL))
109 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
110 while (casegrouper_get_next_group (grouper, &group))
112 run_glm (group, &cmd, ds, model);
114 ok = casegrouper_destroy (grouper);
115 ok = proc_commit (ds) && ok;
119 return ok ? CMD_SUCCESS : CMD_FAILURE;
122 /* Parser for the dependent sub command */
124 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
125 struct cmd_glm *cmd UNUSED,
128 const struct dictionary *dict = dataset_dict (ds);
130 if ((lex_token (lexer) != T_ID
131 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
132 && lex_token (lexer) != T_ALL)
135 if (!parse_variables_const
136 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
141 assert (n_dependent);
143 msg (SE, _("Multivariate GLM not yet supported"));
144 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
150 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
152 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
153 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
154 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
155 pspp_coeff_init (c->coeff + 1, dm);
159 Put the moments in the linreg cache.
162 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
163 struct design_matrix *dm, size_t n)
173 Scan the variable names in the columns of the design matrix.
174 When we find the variable we need, insert its mean in the cache.
176 for (i = 0; i < dm->m->size2; i++)
178 for (j = 0; j < n; j++)
180 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
182 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
183 &skewness, &kurtosis);
184 gsl_vector_set (c->indep_means, i, mean);
185 gsl_vector_set (c->indep_std, i, sqrt (variance));
190 /* Encode categorical variables.
191 Returns number of valid cases. */
193 prepare_categories (struct casereader *input,
194 const struct variable **vars, size_t n_vars,
195 struct moments_var *mom)
201 for (i = 0; i < n_vars; i++)
202 if (var_is_alpha (vars[i]))
203 cat_stored_values_create (vars[i]);
206 for (; casereader_read (input, &c); case_destroy (&c))
209 The second condition ensures the program will run even if
210 there is only one variable to act as both explanatory and
213 for (i = 0; i < n_vars; i++)
215 const union value *val = case_data (&c, vars[i]);
216 if (var_is_alpha (vars[i]))
217 cat_value_update (vars[i], val);
219 moments1_add (mom[i].m, val->f, 1.0);
223 casereader_destroy (input);
229 run_glm (struct casereader *input,
231 const struct dataset *ds,
232 pspp_linreg_cache *model)
237 const struct variable **indep_vars;
238 struct design_matrix *X;
239 struct moments_var *mom;
241 struct casereader *reader;
243 size_t n_data; /* Number of valid cases. */
245 pspp_linreg_opts lopts;
247 assert (model != NULL);
249 if (!casereader_peek (input, 0, &c))
251 output_split_file_values (ds, &c);
256 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
260 for (i = 0; i < n_dependent; i++)
262 if (!var_is_numeric (v_dependent[i]))
264 msg (SE, _("Dependent variable must be numeric."));
269 mom = xnmalloc (n_dependent, sizeof (*mom));
270 mom->m = moments1_create (MOMENT_VARIANCE);
271 mom->v = v_dependent[0];
272 lopts.get_depvar_mean_std = 1;
274 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
275 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
277 for (i = 0; i < cmd->n_by; i++)
279 indep_vars[i] = cmd->v_by[i];
283 reader = casereader_clone (input);
284 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
286 reader = casereader_create_filter_missing (reader, v_dependent, 1,
288 n_data = prepare_categories (casereader_clone (reader),
289 indep_vars, n_indep, mom);
291 if ((n_data > 0) && (n_indep > 0))
293 Y = gsl_vector_alloc (n_data);
295 design_matrix_create (n_indep,
296 (const struct variable **) indep_vars,
298 for (i = 0; i < X->m->size2; i++)
300 lopts.get_indep_mean_std[i] = 1;
302 model = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
303 model->indep_means = gsl_vector_alloc (X->m->size2);
304 model->indep_std = gsl_vector_alloc (X->m->size2);
305 model->depvar = v_dependent[0];
306 reader = casereader_create_counter (reader, &row, -1);
307 for (; casereader_read (reader, &c); case_destroy (&c))
309 for (i = 0; i < n_indep; ++i)
311 const struct variable *v = indep_vars[i];
312 const union value *val = case_data (&c, v);
313 if (var_is_alpha (v))
314 design_matrix_set_categorical (X, row, v, val);
316 design_matrix_set_numeric (X, row, v, val);
318 gsl_vector_set (Y, row, case_num (&c, model->depvar));
320 casereader_destroy (reader);
322 Now that we know the number of coefficients, allocate space
323 and store pointers to the variables that correspond to the
326 coeff_init (model, X);
329 Find the least-squares estimates and other statistics.
331 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, model);
332 compute_moments (model, mom, X, n_dependent);
335 design_matrix_destroy (X);
339 msg (SE, gettext ("No valid data found. This command was skipped."));
342 free (lopts.get_indep_mean_std);
343 casereader_destroy (input);