1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/category.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <math/covariance-matrix.h>
42 #include <math/coefficient.h>
43 #include <math/linreg.h>
44 #include <math/moments.h>
45 #include <output/table.h>
50 #define GLM_LARGE_DATA 1000
62 static struct cmd_glm cmd;
65 Moments for each of the variables used.
73 const struct variable *v;
78 Dependent variable used.
80 static const struct variable **v_dependent;
83 Number of dependent variables.
85 static size_t n_dependent;
89 Return value for the procedure.
91 static int pspp_glm_rc = CMD_SUCCESS;
93 int cmd_glm (struct lexer *lexer, struct dataset *ds);
96 static bool run_glm (struct casereader *,
98 const struct dataset *, pspp_linreg_cache *);
101 cmd_glm (struct lexer *lexer, struct dataset *ds)
103 struct casegrouper *grouper;
104 struct casereader *group;
105 pspp_linreg_cache *model = NULL;
109 model = xmalloc (sizeof *model);
111 if (!parse_glm (lexer, ds, &cmd, NULL))
115 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
116 while (casegrouper_get_next_group (grouper, &group))
118 run_glm (group, &cmd, ds, model);
120 ok = casegrouper_destroy (grouper);
121 ok = proc_commit (ds) && ok;
125 return ok ? CMD_SUCCESS : CMD_FAILURE;
128 /* Parser for the dependent sub command */
130 glm_custom_dependent (struct lexer *lexer, struct dataset *ds,
131 struct cmd_glm *cmd UNUSED, void *aux UNUSED)
133 const struct dictionary *dict = dataset_dict (ds);
135 if ((lex_token (lexer) != T_ID
136 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
137 && lex_token (lexer) != T_ALL)
140 if (!parse_variables_const
141 (lexer, dict, &v_dependent, &n_dependent, PV_NONE))
146 assert (n_dependent);
148 msg (SE, _("Multivariate GLM not yet supported"));
149 n_dependent = 1; /* Drop this line after adding support for multivariate GLM. */
155 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
157 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
158 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
159 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
160 pspp_coeff_init (c->coeff + 1, dm);
164 Put the moments in the linreg cache.
167 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
168 struct design_matrix *dm, size_t n)
178 Scan the variable names in the columns of the design matrix.
179 When we find the variable we need, insert its mean in the cache.
181 for (i = 0; i < dm->m->size2; i++)
183 for (j = 0; j < n; j++)
185 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
187 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
188 &skewness, &kurtosis);
189 gsl_vector_set (c->indep_means, i, mean);
190 gsl_vector_set (c->indep_std, i, sqrt (variance));
196 /* Encode categorical variables.
197 Returns number of valid cases. */
199 data_pass_one (struct casereader *input,
200 const struct variable **vars, size_t n_vars,
201 struct moments_var **mom)
207 for (i = 0; i < n_vars; i++)
209 mom[i] = xmalloc (sizeof (*mom[i]));
211 mom[i]->mean = xmalloc (sizeof (*mom[i]->mean));
212 mom[i]->variance = xmalloc (sizeof (*mom[i]->mean));
213 mom[i]->weight = xmalloc (sizeof (*mom[i]->weight));
214 mom[i]->m = moments1_create (MOMENT_VARIANCE);
215 if (var_is_alpha (vars[i]))
216 cat_stored_values_create (vars[i]);
220 for (; casereader_read (input, &c); case_destroy (&c))
223 The second condition ensures the program will run even if
224 there is only one variable to act as both explanatory and
227 for (i = 0; i < n_vars; i++)
229 const union value *val = case_data (&c, vars[i]);
230 if (var_is_alpha (vars[i]))
231 cat_value_update (vars[i], val);
233 moments1_add (mom[i]->m, val->f, 1.0);
237 casereader_destroy (input);
238 for (i = 0; i < n_vars; i++)
240 if (var_is_numeric (mom[i]->v))
242 moments1_calculate (mom[i]->m, mom[i]->weight, mom[i]->mean,
243 mom[i]->variance, NULL, NULL);
251 run_glm (struct casereader *input,
253 const struct dataset *ds, pspp_linreg_cache * model)
259 const struct variable **indep_vars;
260 const struct variable **all_vars;
261 struct design_matrix *X;
262 struct moments_var **mom;
263 struct casereader *reader;
266 size_t n_data; /* Number of valid cases. */
268 pspp_linreg_opts lopts;
270 assert (model != NULL);
272 if (!casereader_peek (input, 0, &c))
274 casereader_destroy (input);
277 output_split_file_values (ds, &c);
282 dict_get_vars (dataset_dict (ds), &v_dependent, &n_dependent,
288 lopts.get_depvar_mean_std = 1;
290 lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int));
291 indep_vars = xnmalloc (cmd->n_by, sizeof *indep_vars);
292 n_all_vars = cmd->n_by + n_dependent;
293 all_vars = xnmalloc (n_all_vars, sizeof *all_vars);
295 for (i = 0; i < n_dependent; i++)
297 all_vars[i] = v_dependent[i];
299 for (i = 0; i < cmd->n_by; i++)
301 indep_vars[i] = cmd->v_by[i];
302 all_vars[i + n_dependent] = cmd->v_by[i];
305 mom = xnmalloc (n_all_vars, sizeof (*mom));
308 reader = casereader_clone (input);
309 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
311 reader = casereader_create_filter_missing (reader, v_dependent, 1,
313 n_data = data_pass_one (casereader_clone (reader),
314 (const struct variable **) all_vars, n_all_vars,
317 if ((n_data > 0) && (n_indep > 0))
320 covariance_matrix_create (n_all_vars,
321 (const struct variable **) all_vars);
322 reader = casereader_create_counter (reader, &row, -1);
323 for (; casereader_read (reader, &c); case_destroy (&c))
326 Accumulate the covariance matrix.
328 for (i = 0; i < n_all_vars; ++i)
330 const struct variable *v = all_vars[i];
331 const union value *val_v = case_data (&c, v);
332 for (j = i; j < n_all_vars; j++)
334 const struct variable *w = all_vars[j];
335 const union value *val_w = case_data (&c, w);
336 covariance_pass_two (X, *mom[i]->mean, *mom[j]->mean,
342 casereader_destroy (reader);
343 for (i = 0; i < n_all_vars; i++)
345 moments1_destroy (mom[i]->m);
347 free (mom[i]->variance);
348 free (mom[i]->weight);
352 covariance_matrix_destroy (X);
356 msg (SE, gettext ("No valid data found. This command was skipped."));
359 free (lopts.get_indep_mean_std);
360 casereader_destroy (input);