1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include "regression-export.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/category.h>
30 #include <data/dictionary.h>
31 #include <data/missing-values.h>
32 #include <data/procedure.h>
33 #include <data/transformations.h>
34 #include <data/value-labels.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/dictionary/split-file.h>
38 #include <language/data-io/file-handle.h>
39 #include <language/lexer/lexer.h>
40 #include <libpspp/alloc.h>
41 #include <libpspp/compiler.h>
42 #include <libpspp/message.h>
43 #include <libpspp/taint.h>
44 #include <math/design-matrix.h>
45 #include <math/coefficient.h>
46 #include <math/linreg/linreg.h>
47 #include <math/moments.h>
48 #include <output/table.h>
51 #define _(msgid) gettext (msgid)
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
87 Moments for each of the variables used.
92 const struct variable *v;
95 /* Linear regression models. */
96 static pspp_linreg_cache **models = NULL;
99 Transformations for saving predicted values
104 int n_trns; /* Number of transformations. */
105 int trns_id; /* Which trns is this one? */
106 pspp_linreg_cache *c; /* Linear model for this trns. */
109 Variables used (both explanatory and response).
111 static const struct variable **v_variables;
116 static size_t n_variables;
119 File where the model will be saved if the EXPORT subcommand
122 static struct file_handle *model_file;
124 static bool run_regression (struct casereader *, struct cmd_regression *,
128 STATISTICS subcommand output functions.
130 static void reg_stats_r (pspp_linreg_cache *);
131 static void reg_stats_coeff (pspp_linreg_cache *);
132 static void reg_stats_anova (pspp_linreg_cache *);
133 static void reg_stats_outs (pspp_linreg_cache *);
134 static void reg_stats_zpp (pspp_linreg_cache *);
135 static void reg_stats_label (pspp_linreg_cache *);
136 static void reg_stats_sha (pspp_linreg_cache *);
137 static void reg_stats_ci (pspp_linreg_cache *);
138 static void reg_stats_f (pspp_linreg_cache *);
139 static void reg_stats_bcov (pspp_linreg_cache *);
140 static void reg_stats_ses (pspp_linreg_cache *);
141 static void reg_stats_xtx (pspp_linreg_cache *);
142 static void reg_stats_collin (pspp_linreg_cache *);
143 static void reg_stats_tol (pspp_linreg_cache *);
144 static void reg_stats_selection (pspp_linreg_cache *);
145 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
146 int, pspp_linreg_cache *);
149 reg_stats_r (pspp_linreg_cache * c)
159 rsq = c->ssm / c->sst;
160 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
161 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
162 t = tab_create (n_cols, n_rows, 0);
163 tab_dim (t, tab_natural_dimensions);
164 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
165 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
166 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
167 tab_vline (t, TAL_0, 1, 0, 0);
169 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
170 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
171 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
172 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
173 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
174 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
175 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
176 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
177 tab_title (t, _("Model Summary"));
182 Table showing estimated regression coefficients.
185 reg_stats_coeff (pspp_linreg_cache * c)
197 const struct variable *v;
198 const union value *val;
203 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
204 n_rows = c->n_coeffs + 2;
206 t = tab_create (n_cols, n_rows, 0);
207 tab_headers (t, 2, 0, 1, 0);
208 tab_dim (t, tab_natural_dimensions);
209 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
210 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
211 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
212 tab_vline (t, TAL_0, 1, 0, 0);
214 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
215 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
216 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
217 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
218 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
219 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
220 coeff = c->coeff[0]->estimate;
221 tab_float (t, 2, 1, 0, coeff, 10, 2);
222 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
223 tab_float (t, 3, 1, 0, std_err, 10, 2);
224 beta = coeff / c->depvar_std;
225 tab_float (t, 4, 1, 0, beta, 10, 2);
226 t_stat = coeff / std_err;
227 tab_float (t, 5, 1, 0, t_stat, 10, 2);
228 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
229 tab_float (t, 6, 1, 0, pval, 10, 2);
230 for (j = 1; j <= c->n_indeps; j++)
232 v = pspp_coeff_get_var (c->coeff[j], 0);
233 label = var_to_string (v);
234 /* Do not overwrite the variable's name. */
235 strncpy (tmp, label, MAX_STRING);
236 if (var_is_alpha (v))
239 Append the value associated with this coefficient.
240 This makes sense only if we us the usual binary encoding
244 val = pspp_coeff_get_value (c->coeff[j], v);
245 val_s = var_get_value_name (v, val);
246 strncat (tmp, val_s, MAX_STRING);
249 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
251 Regression coefficients.
253 coeff = c->coeff[j]->estimate;
254 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
256 Standard error of the coefficients.
258 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
259 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
261 'Standardized' coefficient, i.e., regression coefficient
262 if all variables had unit variance.
264 beta = gsl_vector_get (c->indep_std, j);
265 beta *= coeff / c->depvar_std;
266 tab_float (t, 4, j + 1, 0, beta, 10, 2);
269 Test statistic for H0: coefficient is 0.
271 t_stat = coeff / std_err;
272 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
274 P values for the test statistic above.
277 2 * gsl_cdf_tdist_Q (fabs (t_stat),
278 (double) (c->n_obs - c->n_coeffs));
279 tab_float (t, 6, j + 1, 0, pval, 10, 2);
281 tab_title (t, _("Coefficients"));
287 Display the ANOVA table.
290 reg_stats_anova (pspp_linreg_cache * c)
294 const double msm = c->ssm / c->dfm;
295 const double mse = c->sse / c->dfe;
296 const double F = msm / mse;
297 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
302 t = tab_create (n_cols, n_rows, 0);
303 tab_headers (t, 2, 0, 1, 0);
304 tab_dim (t, tab_natural_dimensions);
306 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
308 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
309 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
310 tab_vline (t, TAL_0, 1, 0, 0);
312 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
313 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
314 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
315 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
316 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
318 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
319 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
320 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
322 /* Sums of Squares */
323 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
324 tab_float (t, 2, 3, 0, c->sst, 10, 2);
325 tab_float (t, 2, 2, 0, c->sse, 10, 2);
328 /* Degrees of freedom */
329 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
330 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
331 tab_float (t, 3, 3, 0, c->dft, 4, 0);
335 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
336 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
338 tab_float (t, 5, 1, 0, F, 8, 3);
340 tab_float (t, 6, 1, 0, pval, 8, 3);
342 tab_title (t, _("ANOVA"));
346 reg_stats_outs (pspp_linreg_cache * c)
351 reg_stats_zpp (pspp_linreg_cache * c)
356 reg_stats_label (pspp_linreg_cache * c)
361 reg_stats_sha (pspp_linreg_cache * c)
366 reg_stats_ci (pspp_linreg_cache * c)
371 reg_stats_f (pspp_linreg_cache * c)
376 reg_stats_bcov (pspp_linreg_cache * c)
388 n_cols = c->n_indeps + 1 + 2;
389 n_rows = 2 * (c->n_indeps + 1);
390 t = tab_create (n_cols, n_rows, 0);
391 tab_headers (t, 2, 0, 1, 0);
392 tab_dim (t, tab_natural_dimensions);
393 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
394 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
395 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
396 tab_vline (t, TAL_0, 1, 0, 0);
397 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
398 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
399 for (i = 1; i < c->n_coeffs; i++)
401 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
402 label = var_to_string (v);
403 tab_text (t, 2, i, TAB_CENTER, label);
404 tab_text (t, i + 2, 0, TAB_CENTER, label);
405 for (k = 1; k < c->n_coeffs; k++)
407 col = (i <= k) ? k : i;
408 row = (i <= k) ? i : k;
409 tab_float (t, k + 2, i, TAB_CENTER,
410 gsl_matrix_get (c->cov, row, col), 8, 3);
413 tab_title (t, _("Coefficient Correlations"));
417 reg_stats_ses (pspp_linreg_cache * c)
422 reg_stats_xtx (pspp_linreg_cache * c)
427 reg_stats_collin (pspp_linreg_cache * c)
432 reg_stats_tol (pspp_linreg_cache * c)
437 reg_stats_selection (pspp_linreg_cache * c)
443 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
444 int keyword, pspp_linreg_cache * c)
453 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
456 The order here must match the order in which the STATISTICS
457 keywords appear in the specification section above.
484 Set everything but F.
486 for (i = 0; i < f; i++)
493 for (i = 0; i < all; i++)
501 Default output: ANOVA table, parameter estimates,
502 and statistics for variables not entered into model,
505 if (keywords[defaults] | d)
513 statistics_keyword_output (reg_stats_r, keywords[r], c);
514 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
515 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
516 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
517 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
518 statistics_keyword_output (reg_stats_label, keywords[label], c);
519 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
520 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
521 statistics_keyword_output (reg_stats_f, keywords[f], c);
522 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
523 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
524 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
525 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
526 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
527 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
531 Free the transformation. Free its linear model if this
532 transformation is the last one.
535 regression_trns_free (void *t_)
538 struct reg_trns *t = t_;
540 if (t->trns_id == t->n_trns)
542 result = pspp_linreg_cache_free (t->c);
550 Gets the predicted values.
553 regression_trns_pred_proc (void *t_, struct ccase *c,
554 casenumber case_idx UNUSED)
558 struct reg_trns *trns = t_;
559 pspp_linreg_cache *model;
560 union value *output = NULL;
561 const union value **vals = NULL;
562 const struct variable **vars = NULL;
564 assert (trns != NULL);
566 assert (model != NULL);
567 assert (model->depvar != NULL);
568 assert (model->pred != NULL);
570 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
571 n_vals = (*model->get_vars) (model, vars);
573 vals = xnmalloc (n_vals, sizeof (*vals));
574 output = case_data_rw (c, model->pred);
575 assert (output != NULL);
577 for (i = 0; i < n_vals; i++)
579 vals[i] = case_data (c, vars[i]);
581 output->f = (*model->predict) ((const struct variable **) vars,
582 vals, model, n_vals);
585 return TRNS_CONTINUE;
592 regression_trns_resid_proc (void *t_, struct ccase *c,
593 casenumber case_idx UNUSED)
597 struct reg_trns *trns = t_;
598 pspp_linreg_cache *model;
599 union value *output = NULL;
600 const union value **vals = NULL;
601 const union value *obs = NULL;
602 const struct variable **vars = NULL;
604 assert (trns != NULL);
606 assert (model != NULL);
607 assert (model->depvar != NULL);
608 assert (model->resid != NULL);
610 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
611 n_vals = (*model->get_vars) (model, vars);
613 vals = xnmalloc (n_vals, sizeof (*vals));
614 output = case_data_rw (c, model->resid);
615 assert (output != NULL);
617 for (i = 0; i < n_vals; i++)
619 vals[i] = case_data (c, vars[i]);
621 obs = case_data (c, model->depvar);
622 output->f = (*model->residual) ((const struct variable **) vars,
623 vals, obs, model, n_vals);
626 return TRNS_CONTINUE;
630 Returns false if NAME is a duplicate of any existing variable name.
633 try_name (const struct dictionary *dict, const char *name)
635 if (dict_lookup_var (dict, name) != NULL)
642 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
643 const char prefix[LONG_NAME_LEN])
647 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
648 while (!try_name (dict, name))
651 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
656 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
657 pspp_linreg_cache * c, struct variable **v, int n_trns)
659 struct dictionary *dict = dataset_dict (ds);
660 static int trns_index = 1;
661 char name[LONG_NAME_LEN];
662 struct variable *new_var;
663 struct reg_trns *t = NULL;
665 t = xmalloc (sizeof (*t));
666 t->trns_id = trns_index;
669 reg_get_name (dict, name, prefix);
670 new_var = dict_create_var (dict, name, 0);
671 assert (new_var != NULL);
673 add_transformation (ds, f, regression_trns_free, t);
678 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
680 pspp_linreg_cache **lc;
684 assert (models != NULL);
688 /* Count the number of transformations we will need. */
689 for (i = 0; i < REGRESSION_SV_count; i++)
696 n_trns *= cmd.n_dependent;
698 for (lc = models; lc < models + cmd.n_dependent; lc++)
700 assert (*lc != NULL);
701 assert ((*lc)->depvar != NULL);
702 if (cmd.a_save[REGRESSION_SV_RESID])
704 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
705 &(*lc)->resid, n_trns);
707 if (cmd.a_save[REGRESSION_SV_PRED])
709 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
710 &(*lc)->pred, n_trns);
716 for (lc = models; lc < models + cmd.n_dependent; lc++)
720 pspp_linreg_cache_free (*lc);
727 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
731 for (i = 0; i < n_vars; i++)
742 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
746 struct variable **varlist;
748 fprintf (fp, "%s", reg_export_categorical_encode_1);
750 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
751 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
753 struct pspp_coeff *coeff = c->coeff[i];
754 const struct variable *v = pspp_coeff_get_var (coeff, 0);
755 if (var_is_alpha (v))
757 if (!reg_inserted (v, varlist, n_vars))
759 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
761 varlist[n_vars] = (struct variable *) v;
766 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
767 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
769 for (i = 0; i < n_vars - 1; i++)
771 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
773 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
775 for (i = 0; i < n_vars; i++)
777 int n_categories = cat_get_n_categories (varlist[i]);
780 fprintf (fp, "%s.name = \"%s\";\n\t",
781 var_get_name (varlist[i]), var_get_name (varlist[i]));
782 fprintf (fp, "%s.n_vals = %d;\n\t",
783 var_get_name (varlist[i]), n_categories);
785 for (j = 0; j < n_categories; j++)
787 const union value *val = cat_subscript_to_value (j, varlist[i]);
788 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
789 var_get_name (varlist[i]), j,
790 var_get_value_name (varlist[i], val));
793 fprintf (fp, "%s", reg_export_categorical_encode_2);
797 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
800 struct pspp_coeff *coeff;
801 const struct variable *v;
803 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
804 for (i = 1; i < c->n_indeps; i++)
807 v = pspp_coeff_get_var (coeff, 0);
808 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
811 v = pspp_coeff_get_var (coeff, 0);
812 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
815 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
817 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
818 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
819 reg_print_depvars (fp, c);
820 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
822 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
823 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
826 reg_has_categorical (pspp_linreg_cache * c)
829 const struct variable *v;
831 for (i = 1; i < c->n_coeffs; i++)
833 v = pspp_coeff_get_var (c->coeff[i], 0);
834 if (var_is_alpha (v))
841 subcommand_export (int export, pspp_linreg_cache * c)
846 int n_quantiles = 100;
848 struct pspp_coeff *coeff;
853 assert (model_file != NULL);
854 fp = fopen (fh_get_file_name (model_file), "w");
856 fprintf (fp, "%s", reg_preamble);
857 reg_print_getvar (fp, c);
858 if (reg_has_categorical (c))
860 reg_print_categorical_encoding (fp, c);
862 fprintf (fp, "%s", reg_export_t_quantiles_1);
863 for (i = 0; i < n_quantiles - 1; i++)
865 tmp = 0.5 + 0.005 * (double) i;
866 fprintf (fp, "%.15e,\n\t\t",
867 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
869 fprintf (fp, "%.15e};\n\t",
870 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
871 fprintf (fp, "%s", reg_export_t_quantiles_2);
872 fprintf (fp, "%s", reg_mean_cmt);
873 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
874 fprintf (fp, "const char *var_names[])\n{\n\t");
875 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
876 for (i = 1; i < c->n_indeps; i++)
879 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
882 fprintf (fp, "%.15e};\n\t", coeff->estimate);
884 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
885 fprintf (fp, "int i;\n\tint j;\n\n\t");
886 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
887 fprintf (fp, "%s", reg_getvar);
888 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
890 for (i = 0; i < c->cov->size1 - 1; i++)
893 for (j = 0; j < c->cov->size2 - 1; j++)
895 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
897 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
900 for (j = 0; j < c->cov->size2 - 1; j++)
902 fprintf (fp, "%.15e, ",
903 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
905 fprintf (fp, "%.15e}\n\t",
906 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
907 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
909 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
910 fprintf (fp, "%s", reg_variance);
911 fprintf (fp, "%s", reg_export_confidence_interval);
912 tmp = c->mse * c->mse;
913 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
914 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
915 fprintf (fp, "%s", reg_export_prediction_interval_3);
917 fp = fopen ("pspp_model_reg.h", "w");
918 fprintf (fp, "%s", reg_header);
924 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
925 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
927 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
928 if (!lex_force_match (lexer, '('))
931 if (lex_match (lexer, '*'))
935 model_file = fh_parse (lexer, FH_REF_FILE);
936 if (model_file == NULL)
940 if (!lex_force_match (lexer, ')'))
947 cmd_regression (struct lexer *lexer, struct dataset *ds)
949 struct casegrouper *grouper;
950 struct casereader *group;
954 if (!parse_regression (lexer, ds, &cmd, NULL))
957 models = xnmalloc (cmd.n_dependent, sizeof *models);
958 for (i = 0; i < cmd.n_dependent; i++)
964 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
965 while (casegrouper_get_next_group (grouper, &group))
966 run_regression (group, &cmd, ds);
967 ok = casegrouper_destroy (grouper);
968 ok = proc_commit (ds) && ok;
970 subcommand_save (ds, cmd.sbc_save, models);
973 return ok ? CMD_SUCCESS : CMD_FAILURE;
977 Is variable k the dependent variable?
980 is_depvar (size_t k, const struct variable *v)
982 return v == v_variables[k];
985 /* Parser for the variables sub command */
987 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
988 struct cmd_regression *cmd UNUSED,
991 const struct dictionary *dict = dataset_dict (ds);
993 lex_match (lexer, '=');
995 if ((lex_token (lexer) != T_ID
996 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
997 && lex_token (lexer) != T_ALL)
1001 if (!parse_variables_const
1002 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1007 assert (n_variables);
1012 /* Identify the explanatory variables in v_variables. Returns
1013 the number of independent variables. */
1015 identify_indep_vars (const struct variable **indep_vars, const struct variable *depvar)
1017 int n_indep_vars = 0;
1020 for (i = 0; i < n_variables; i++)
1021 if (!is_depvar (i, depvar))
1022 indep_vars[n_indep_vars++] = v_variables[i];
1024 return n_indep_vars;
1027 /* Encode categorical variables.
1028 Returns number of valid cases. */
1030 prepare_categories (struct casereader *input,
1031 const struct variable **vars, size_t n_vars,
1032 struct moments_var *mom)
1038 for (i = 0; i < n_vars; i++)
1039 if (var_is_alpha (vars[i]))
1040 cat_stored_values_create (vars[i]);
1043 for (; casereader_read (input, &c); case_destroy (&c))
1046 The second condition ensures the program will run even if
1047 there is only one variable to act as both explanatory and
1050 for (i = 0; i < n_vars; i++)
1052 const union value *val = case_data (&c, vars[i]);
1053 if (var_is_alpha (vars[i]))
1054 cat_value_update (vars[i], val);
1056 moments1_add (mom[i].m, val->f, 1.0);
1060 casereader_destroy (input);
1066 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1068 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1069 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1070 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1071 pspp_coeff_init (c->coeff + 1, dm);
1075 Put the moments in the linreg cache.
1078 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1079 struct design_matrix *dm, size_t n)
1089 Scan the variable names in the columns of the design matrix.
1090 When we find the variable we need, insert its mean in the cache.
1092 for (i = 0; i < dm->m->size2; i++)
1094 for (j = 0; j < n; j++)
1096 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1098 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1099 &skewness, &kurtosis);
1100 gsl_vector_set (c->indep_means, i, mean);
1101 gsl_vector_set (c->indep_std, i, sqrt (variance));
1108 run_regression (struct casereader *input, struct cmd_regression *cmd,
1115 const struct variable **indep_vars;
1116 struct design_matrix *X;
1117 struct moments_var *mom;
1120 pspp_linreg_opts lopts;
1122 assert (models != NULL);
1124 if (!casereader_peek (input, 0, &c))
1126 output_split_file_values (ds, &c);
1131 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1135 for (i = 0; i < cmd->n_dependent; i++)
1137 if (!var_is_numeric (cmd->v_dependent[i]))
1139 msg (SE, _("Dependent variable must be numeric."));
1144 mom = xnmalloc (n_variables, sizeof (*mom));
1145 for (i = 0; i < n_variables; i++)
1147 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1148 (mom + i)->v = v_variables[i];
1150 lopts.get_depvar_mean_std = 1;
1152 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1153 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1155 for (k = 0; k < cmd->n_dependent; k++)
1157 const struct variable *dep_var;
1158 struct casereader *reader;
1161 size_t n_data; /* Number of valid cases. */
1163 dep_var = cmd->v_dependent[k];
1164 n_indep = identify_indep_vars (indep_vars, dep_var);
1166 reader = casereader_clone (input);
1167 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1169 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1171 n_data = prepare_categories (casereader_clone (reader),
1172 indep_vars, n_indep, mom);
1174 if ((n_data > 0) && (n_indep > 0))
1176 Y = gsl_vector_alloc (n_data);
1178 design_matrix_create (n_indep,
1179 (const struct variable **) indep_vars,
1181 for (i = 0; i < X->m->size2; i++)
1183 lopts.get_indep_mean_std[i] = 1;
1185 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1186 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1187 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1188 models[k]->depvar = dep_var;
1190 For large data sets, use QR decomposition.
1192 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1194 models[k]->method = PSPP_LINREG_SVD;
1198 The second pass fills the design matrix.
1200 reader = casereader_create_counter (reader, &row, -1);
1201 for (; casereader_read (reader, &c); case_destroy (&c))
1203 for (i = 0; i < n_indep; ++i)
1205 const struct variable *v = indep_vars[i];
1206 const union value *val = case_data (&c, v);
1207 if (var_is_alpha (v))
1208 design_matrix_set_categorical (X, row, v, val);
1210 design_matrix_set_numeric (X, row, v, val);
1212 gsl_vector_set (Y, row, case_num (&c, dep_var));
1214 casereader_destroy (reader);
1216 Now that we know the number of coefficients, allocate space
1217 and store pointers to the variables that correspond to the
1220 coeff_init (models[k], X);
1223 Find the least-squares estimates and other statistics.
1225 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1226 compute_moments (models[k], mom, X, n_variables);
1228 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1230 subcommand_statistics (cmd->a_statistics, models[k]);
1231 subcommand_export (cmd->sbc_export, models[k]);
1234 gsl_vector_free (Y);
1235 design_matrix_destroy (X);
1239 msg (SE, gettext ("No valid data found. This command was skipped."));
1243 free (lopts.get_indep_mean_std);
1244 casereader_destroy (input);