1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include "regression-export.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/category.h>
30 #include <data/dictionary.h>
31 #include <data/missing-values.h>
32 #include <data/procedure.h>
33 #include <data/transformations.h>
34 #include <data/value-labels.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/dictionary/split-file.h>
38 #include <language/data-io/file-handle.h>
39 #include <language/lexer/lexer.h>
40 #include <libpspp/alloc.h>
41 #include <libpspp/compiler.h>
42 #include <libpspp/message.h>
43 #include <libpspp/taint.h>
44 #include <math/design-matrix.h>
45 #include <math/coefficient.h>
46 #include <math/linreg/linreg.h>
47 #include <math/moments.h>
48 #include <output/table.h>
51 #define _(msgid) gettext (msgid)
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
87 Moments for each of the variables used.
92 const struct variable *v;
95 /* Linear regression models. */
96 static pspp_linreg_cache **models = NULL;
99 Transformations for saving predicted values
104 int n_trns; /* Number of transformations. */
105 int trns_id; /* Which trns is this one? */
106 pspp_linreg_cache *c; /* Linear model for this trns. */
109 Variables used (both explanatory and response).
111 static const struct variable **v_variables;
116 static size_t n_variables;
119 File where the model will be saved if the EXPORT subcommand
122 static struct file_handle *model_file;
124 static bool run_regression (struct casereader *, struct cmd_regression *,
128 STATISTICS subcommand output functions.
130 static void reg_stats_r (pspp_linreg_cache *);
131 static void reg_stats_coeff (pspp_linreg_cache *);
132 static void reg_stats_anova (pspp_linreg_cache *);
133 static void reg_stats_outs (pspp_linreg_cache *);
134 static void reg_stats_zpp (pspp_linreg_cache *);
135 static void reg_stats_label (pspp_linreg_cache *);
136 static void reg_stats_sha (pspp_linreg_cache *);
137 static void reg_stats_ci (pspp_linreg_cache *);
138 static void reg_stats_f (pspp_linreg_cache *);
139 static void reg_stats_bcov (pspp_linreg_cache *);
140 static void reg_stats_ses (pspp_linreg_cache *);
141 static void reg_stats_xtx (pspp_linreg_cache *);
142 static void reg_stats_collin (pspp_linreg_cache *);
143 static void reg_stats_tol (pspp_linreg_cache *);
144 static void reg_stats_selection (pspp_linreg_cache *);
145 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
146 int, pspp_linreg_cache *);
149 reg_stats_r (pspp_linreg_cache * c)
159 rsq = c->ssm / c->sst;
160 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
161 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
162 t = tab_create (n_cols, n_rows, 0);
163 tab_dim (t, tab_natural_dimensions);
164 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
165 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
166 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
167 tab_vline (t, TAL_0, 1, 0, 0);
169 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
170 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
171 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
172 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
173 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
174 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
175 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
176 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
177 tab_title (t, _("Model Summary"));
182 Table showing estimated regression coefficients.
185 reg_stats_coeff (pspp_linreg_cache * c)
197 const struct variable *v;
198 const union value *val;
203 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
204 n_rows = c->n_coeffs + 2;
206 t = tab_create (n_cols, n_rows, 0);
207 tab_headers (t, 2, 0, 1, 0);
208 tab_dim (t, tab_natural_dimensions);
209 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
210 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
211 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
212 tab_vline (t, TAL_0, 1, 0, 0);
214 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
215 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
216 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
217 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
218 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
219 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
220 coeff = c->coeff[0]->estimate;
221 tab_float (t, 2, 1, 0, coeff, 10, 2);
222 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
223 tab_float (t, 3, 1, 0, std_err, 10, 2);
224 beta = coeff / c->depvar_std;
225 tab_float (t, 4, 1, 0, beta, 10, 2);
226 t_stat = coeff / std_err;
227 tab_float (t, 5, 1, 0, t_stat, 10, 2);
228 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
229 tab_float (t, 6, 1, 0, pval, 10, 2);
230 for (j = 1; j <= c->n_indeps; j++)
232 v = pspp_coeff_get_var (c->coeff[j], 0);
233 label = var_to_string (v);
234 /* Do not overwrite the variable's name. */
235 strncpy (tmp, label, MAX_STRING);
236 if (var_is_alpha (v))
239 Append the value associated with this coefficient.
240 This makes sense only if we us the usual binary encoding
244 val = pspp_coeff_get_value (c->coeff[j], v);
245 val_s = var_get_value_name (v, val);
246 strncat (tmp, val_s, MAX_STRING);
249 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
251 Regression coefficients.
253 coeff = c->coeff[j]->estimate;
254 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
256 Standard error of the coefficients.
258 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
259 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
261 'Standardized' coefficient, i.e., regression coefficient
262 if all variables had unit variance.
264 beta = gsl_vector_get (c->indep_std, j);
265 beta *= coeff / c->depvar_std;
266 tab_float (t, 4, j + 1, 0, beta, 10, 2);
269 Test statistic for H0: coefficient is 0.
271 t_stat = coeff / std_err;
272 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
274 P values for the test statistic above.
277 2 * gsl_cdf_tdist_Q (fabs (t_stat),
278 (double) (c->n_obs - c->n_coeffs));
279 tab_float (t, 6, j + 1, 0, pval, 10, 2);
281 tab_title (t, _("Coefficients"));
287 Display the ANOVA table.
290 reg_stats_anova (pspp_linreg_cache * c)
294 const double msm = c->ssm / c->dfm;
295 const double mse = c->sse / c->dfe;
296 const double F = msm / mse;
297 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
302 t = tab_create (n_cols, n_rows, 0);
303 tab_headers (t, 2, 0, 1, 0);
304 tab_dim (t, tab_natural_dimensions);
306 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
308 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
309 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
310 tab_vline (t, TAL_0, 1, 0, 0);
312 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
313 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
314 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
315 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
316 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
318 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
319 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
320 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
322 /* Sums of Squares */
323 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
324 tab_float (t, 2, 3, 0, c->sst, 10, 2);
325 tab_float (t, 2, 2, 0, c->sse, 10, 2);
328 /* Degrees of freedom */
329 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
330 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
331 tab_float (t, 3, 3, 0, c->dft, 4, 0);
335 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
336 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
338 tab_float (t, 5, 1, 0, F, 8, 3);
340 tab_float (t, 6, 1, 0, pval, 8, 3);
342 tab_title (t, _("ANOVA"));
346 reg_stats_outs (pspp_linreg_cache * c)
351 reg_stats_zpp (pspp_linreg_cache * c)
356 reg_stats_label (pspp_linreg_cache * c)
361 reg_stats_sha (pspp_linreg_cache * c)
366 reg_stats_ci (pspp_linreg_cache * c)
371 reg_stats_f (pspp_linreg_cache * c)
376 reg_stats_bcov (pspp_linreg_cache * c)
388 n_cols = c->n_indeps + 1 + 2;
389 n_rows = 2 * (c->n_indeps + 1);
390 t = tab_create (n_cols, n_rows, 0);
391 tab_headers (t, 2, 0, 1, 0);
392 tab_dim (t, tab_natural_dimensions);
393 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
394 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
395 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
396 tab_vline (t, TAL_0, 1, 0, 0);
397 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
398 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
399 for (i = 1; i < c->n_coeffs; i++)
401 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
402 label = var_to_string (v);
403 tab_text (t, 2, i, TAB_CENTER, label);
404 tab_text (t, i + 2, 0, TAB_CENTER, label);
405 for (k = 1; k < c->n_coeffs; k++)
407 col = (i <= k) ? k : i;
408 row = (i <= k) ? i : k;
409 tab_float (t, k + 2, i, TAB_CENTER,
410 gsl_matrix_get (c->cov, row, col), 8, 3);
413 tab_title (t, _("Coefficient Correlations"));
417 reg_stats_ses (pspp_linreg_cache * c)
422 reg_stats_xtx (pspp_linreg_cache * c)
427 reg_stats_collin (pspp_linreg_cache * c)
432 reg_stats_tol (pspp_linreg_cache * c)
437 reg_stats_selection (pspp_linreg_cache * c)
443 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
444 int keyword, pspp_linreg_cache * c)
453 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
456 The order here must match the order in which the STATISTICS
457 keywords appear in the specification section above.
484 Set everything but F.
486 for (i = 0; i < f; i++)
493 for (i = 0; i < all; i++)
501 Default output: ANOVA table, parameter estimates,
502 and statistics for variables not entered into model,
505 if (keywords[defaults] | d)
513 statistics_keyword_output (reg_stats_r, keywords[r], c);
514 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
515 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
516 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
517 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
518 statistics_keyword_output (reg_stats_label, keywords[label], c);
519 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
520 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
521 statistics_keyword_output (reg_stats_f, keywords[f], c);
522 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
523 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
524 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
525 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
526 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
527 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
531 Free the transformation. Free its linear model if this
532 transformation is the last one.
535 regression_trns_free (void *t_)
538 struct reg_trns *t = t_;
540 if (t->trns_id == t->n_trns)
542 result = pspp_linreg_cache_free (t->c);
550 Gets the predicted values.
553 regression_trns_pred_proc (void *t_, struct ccase *c,
554 casenumber case_idx UNUSED)
558 struct reg_trns *trns = t_;
559 pspp_linreg_cache *model;
560 union value *output = NULL;
561 const union value **vals = NULL;
562 const struct variable **vars = NULL;
564 assert (trns != NULL);
566 assert (model != NULL);
567 assert (model->depvar != NULL);
568 assert (model->pred != NULL);
570 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
571 n_vals = (*model->get_vars) (model, vars);
573 vals = xnmalloc (n_vals, sizeof (*vals));
574 output = case_data_rw (c, model->pred);
575 assert (output != NULL);
577 for (i = 0; i < n_vals; i++)
579 vals[i] = case_data (c, vars[i]);
581 output->f = (*model->predict) ((const struct variable **) vars,
582 vals, model, n_vals);
585 return TRNS_CONTINUE;
592 regression_trns_resid_proc (void *t_, struct ccase *c,
593 casenumber case_idx UNUSED)
597 struct reg_trns *trns = t_;
598 pspp_linreg_cache *model;
599 union value *output = NULL;
600 const union value **vals = NULL;
601 const union value *obs = NULL;
602 const struct variable **vars = NULL;
604 assert (trns != NULL);
606 assert (model != NULL);
607 assert (model->depvar != NULL);
608 assert (model->resid != NULL);
610 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
611 n_vals = (*model->get_vars) (model, vars);
613 vals = xnmalloc (n_vals, sizeof (*vals));
614 output = case_data_rw (c, model->resid);
615 assert (output != NULL);
617 for (i = 0; i < n_vals; i++)
619 vals[i] = case_data (c, vars[i]);
621 obs = case_data (c, model->depvar);
622 output->f = (*model->residual) ((const struct variable **) vars,
623 vals, obs, model, n_vals);
626 return TRNS_CONTINUE;
630 Returns false if NAME is a duplicate of any existing variable name.
633 try_name (const struct dictionary *dict, const char *name)
635 if (dict_lookup_var (dict, name) != NULL)
642 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
643 const char prefix[LONG_NAME_LEN])
647 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
648 while (!try_name (dict, name))
651 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
656 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
657 pspp_linreg_cache * c, struct variable **v, int n_trns)
659 struct dictionary *dict = dataset_dict (ds);
660 static int trns_index = 1;
661 char name[LONG_NAME_LEN];
662 struct variable *new_var;
663 struct reg_trns *t = NULL;
665 t = xmalloc (sizeof (*t));
666 t->trns_id = trns_index;
669 reg_get_name (dict, name, prefix);
670 new_var = dict_create_var (dict, name, 0);
671 assert (new_var != NULL);
673 add_transformation (ds, f, regression_trns_free, t);
678 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
680 pspp_linreg_cache **lc;
684 assert (models != NULL);
688 /* Count the number of transformations we will need. */
689 for (i = 0; i < REGRESSION_SV_count; i++)
696 n_trns *= cmd.n_dependent;
698 for (lc = models; lc < models + cmd.n_dependent; lc++)
700 assert (*lc != NULL);
701 assert ((*lc)->depvar != NULL);
702 if (cmd.a_save[REGRESSION_SV_RESID])
704 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
705 &(*lc)->resid, n_trns);
707 if (cmd.a_save[REGRESSION_SV_PRED])
709 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
710 &(*lc)->pred, n_trns);
716 for (lc = models; lc < models + cmd.n_dependent; lc++)
720 pspp_linreg_cache_free (*lc);
727 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
731 for (i = 0; i < n_vars; i++)
742 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
746 struct variable **varlist;
748 fprintf (fp, "%s", reg_export_categorical_encode_1);
750 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
751 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
753 struct pspp_coeff *coeff = c->coeff[i];
754 const struct variable *v = pspp_coeff_get_var (coeff, 0);
755 if (var_is_alpha (v))
757 if (!reg_inserted (v, varlist, n_vars))
759 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
761 varlist[n_vars] = (struct variable *) v;
766 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
767 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
769 for (i = 0; i < n_vars - 1; i++)
771 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
773 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
775 for (i = 0; i < n_vars; i++)
777 int n_categories = cat_get_n_categories (varlist[i]);
780 fprintf (fp, "%s.name = \"%s\";\n\t",
781 var_get_name (varlist[i]), var_get_name (varlist[i]));
782 fprintf (fp, "%s.n_vals = %d;\n\t",
783 var_get_name (varlist[i]), n_categories);
785 for (j = 0; j < n_categories; j++)
787 const union value *val = cat_subscript_to_value (j, varlist[i]);
788 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
789 var_get_name (varlist[i]), j,
790 var_get_value_name (varlist[i], val));
793 fprintf (fp, "%s", reg_export_categorical_encode_2);
797 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
800 struct pspp_coeff *coeff;
801 const struct variable *v;
803 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
804 for (i = 1; i < c->n_indeps; i++)
807 v = pspp_coeff_get_var (coeff, 0);
808 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
811 v = pspp_coeff_get_var (coeff, 0);
812 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
815 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
817 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
818 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
819 reg_print_depvars (fp, c);
820 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
822 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
823 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
826 reg_has_categorical (pspp_linreg_cache * c)
829 const struct variable *v;
831 for (i = 1; i < c->n_coeffs; i++)
833 v = pspp_coeff_get_var (c->coeff[i], 0);
834 if (var_is_alpha (v))
841 subcommand_export (int export, pspp_linreg_cache * c)
846 int n_quantiles = 100;
848 struct pspp_coeff *coeff;
853 assert (model_file != NULL);
854 fp = fopen (fh_get_file_name (model_file), "w");
856 fprintf (fp, "%s", reg_preamble);
857 reg_print_getvar (fp, c);
858 if (reg_has_categorical (c))
860 reg_print_categorical_encoding (fp, c);
862 fprintf (fp, "%s", reg_export_t_quantiles_1);
863 for (i = 0; i < n_quantiles - 1; i++)
865 tmp = 0.5 + 0.005 * (double) i;
866 fprintf (fp, "%.15e,\n\t\t",
867 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
869 fprintf (fp, "%.15e};\n\t",
870 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
871 fprintf (fp, "%s", reg_export_t_quantiles_2);
872 fprintf (fp, "%s", reg_mean_cmt);
873 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
874 fprintf (fp, "const char *var_names[])\n{\n\t");
875 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
876 for (i = 1; i < c->n_indeps; i++)
879 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
882 fprintf (fp, "%.15e};\n\t", coeff->estimate);
884 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
885 fprintf (fp, "int i;\n\tint j;\n\n\t");
886 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
887 fprintf (fp, "%s", reg_getvar);
888 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
890 for (i = 0; i < c->cov->size1 - 1; i++)
893 for (j = 0; j < c->cov->size2 - 1; j++)
895 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
897 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
900 for (j = 0; j < c->cov->size2 - 1; j++)
902 fprintf (fp, "%.15e, ",
903 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
905 fprintf (fp, "%.15e}\n\t",
906 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
907 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
909 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
910 fprintf (fp, "%s", reg_variance);
911 fprintf (fp, "%s", reg_export_confidence_interval);
912 tmp = c->mse * c->mse;
913 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
914 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
915 fprintf (fp, "%s", reg_export_prediction_interval_3);
917 fp = fopen ("pspp_model_reg.h", "w");
918 fprintf (fp, "%s", reg_header);
924 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
925 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
927 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
928 if (!lex_force_match (lexer, '('))
931 if (lex_match (lexer, '*'))
935 model_file = fh_parse (lexer, FH_REF_FILE);
936 if (model_file == NULL)
940 if (!lex_force_match (lexer, ')'))
947 cmd_regression (struct lexer *lexer, struct dataset *ds)
949 struct casegrouper *grouper;
950 struct casereader *group;
954 if (!parse_regression (lexer, ds, &cmd, NULL))
957 models = xnmalloc (cmd.n_dependent, sizeof *models);
958 for (i = 0; i < cmd.n_dependent; i++)
964 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
965 while (casegrouper_get_next_group (grouper, &group))
966 run_regression (group, &cmd, ds);
967 ok = casegrouper_destroy (grouper);
968 ok = proc_commit (ds) && ok;
970 subcommand_save (ds, cmd.sbc_save, models);
973 return ok ? CMD_SUCCESS : CMD_FAILURE;
977 Is variable k the dependent variable?
980 is_depvar (size_t k, const struct variable *v)
982 return v == v_variables[k];
985 /* Parser for the variables sub command */
987 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
988 struct cmd_regression *cmd UNUSED,
991 const struct dictionary *dict = dataset_dict (ds);
993 lex_match (lexer, '=');
995 if ((lex_token (lexer) != T_ID
996 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
997 && lex_token (lexer) != T_ALL)
1001 if (!parse_variables_const
1002 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1007 assert (n_variables);
1012 /* Identify the explanatory variables in v_variables. Returns
1013 the number of independent variables. */
1015 identify_indep_vars (const struct variable **indep_vars,
1016 const struct variable *depvar)
1018 int n_indep_vars = 0;
1021 for (i = 0; i < n_variables; i++)
1022 if (!is_depvar (i, depvar))
1023 indep_vars[n_indep_vars++] = v_variables[i];
1024 if ((n_indep_vars < 2) && is_depvar (0, depvar))
1027 There is only one independent variable, and it is the same
1028 as the dependent variable. Print a warning and continue.
1031 gettext ("The dependent variable is equal to the independent variable. The least squares line is therefore Y=X. Standard errors and related statistics may be meaningless."));
1033 indep_vars[0] = v_variables[0];
1035 return n_indep_vars;
1038 /* Encode categorical variables.
1039 Returns number of valid cases. */
1041 prepare_categories (struct casereader *input,
1042 const struct variable **vars, size_t n_vars,
1043 struct moments_var *mom)
1049 assert (vars != NULL);
1050 assert (mom != NULL);
1052 for (i = 0; i < n_vars; i++)
1053 if (var_is_alpha (vars[i]))
1054 cat_stored_values_create (vars[i]);
1057 for (; casereader_read (input, &c); case_destroy (&c))
1060 The second condition ensures the program will run even if
1061 there is only one variable to act as both explanatory and
1064 for (i = 0; i < n_vars; i++)
1066 const union value *val = case_data (&c, vars[i]);
1067 if (var_is_alpha (vars[i]))
1068 cat_value_update (vars[i], val);
1070 moments1_add (mom[i].m, val->f, 1.0);
1074 casereader_destroy (input);
1080 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1082 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1083 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1084 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1085 pspp_coeff_init (c->coeff + 1, dm);
1089 Put the moments in the linreg cache.
1092 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1093 struct design_matrix *dm, size_t n)
1103 Scan the variable names in the columns of the design matrix.
1104 When we find the variable we need, insert its mean in the cache.
1106 for (i = 0; i < dm->m->size2; i++)
1108 for (j = 0; j < n; j++)
1110 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1112 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1113 &skewness, &kurtosis);
1114 gsl_vector_set (c->indep_means, i, mean);
1115 gsl_vector_set (c->indep_std, i, sqrt (variance));
1122 run_regression (struct casereader *input, struct cmd_regression *cmd,
1129 const struct variable **indep_vars;
1130 struct design_matrix *X;
1131 struct moments_var *mom;
1134 pspp_linreg_opts lopts;
1136 assert (models != NULL);
1138 if (!casereader_peek (input, 0, &c))
1140 output_split_file_values (ds, &c);
1145 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1149 for (i = 0; i < cmd->n_dependent; i++)
1151 if (!var_is_numeric (cmd->v_dependent[i]))
1153 msg (SE, _("Dependent variable must be numeric."));
1158 mom = xnmalloc (n_variables, sizeof (*mom));
1159 for (i = 0; i < n_variables; i++)
1161 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1162 (mom + i)->v = v_variables[i];
1164 lopts.get_depvar_mean_std = 1;
1166 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1167 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1169 for (k = 0; k < cmd->n_dependent; k++)
1171 const struct variable *dep_var;
1172 struct casereader *reader;
1175 size_t n_data; /* Number of valid cases. */
1177 dep_var = cmd->v_dependent[k];
1178 n_indep = identify_indep_vars (indep_vars, dep_var);
1179 reader = casereader_clone (input);
1180 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1182 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1184 n_data = prepare_categories (casereader_clone (reader),
1185 indep_vars, n_indep, mom);
1187 if ((n_data > 0) && (n_indep > 0))
1189 Y = gsl_vector_alloc (n_data);
1191 design_matrix_create (n_indep,
1192 (const struct variable **) indep_vars,
1194 for (i = 0; i < X->m->size2; i++)
1196 lopts.get_indep_mean_std[i] = 1;
1198 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1199 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1200 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1201 models[k]->depvar = dep_var;
1203 For large data sets, use QR decomposition.
1205 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1207 models[k]->method = PSPP_LINREG_QR;
1211 The second pass fills the design matrix.
1213 reader = casereader_create_counter (reader, &row, -1);
1214 for (; casereader_read (reader, &c); case_destroy (&c))
1216 for (i = 0; i < n_indep; ++i)
1218 const struct variable *v = indep_vars[i];
1219 const union value *val = case_data (&c, v);
1220 if (var_is_alpha (v))
1221 design_matrix_set_categorical (X, row, v, val);
1223 design_matrix_set_numeric (X, row, v, val);
1225 gsl_vector_set (Y, row, case_num (&c, dep_var));
1228 Now that we know the number of coefficients, allocate space
1229 and store pointers to the variables that correspond to the
1232 coeff_init (models[k], X);
1235 Find the least-squares estimates and other statistics.
1237 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1238 compute_moments (models[k], mom, X, n_variables);
1240 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1242 subcommand_statistics (cmd->a_statistics, models[k]);
1243 subcommand_export (cmd->sbc_export, models[k]);
1246 gsl_vector_free (Y);
1247 design_matrix_destroy (X);
1252 gettext ("No valid data found. This command was skipped."));
1254 casereader_destroy (reader);
1257 free (lopts.get_indep_mean_std);
1258 casereader_destroy (input);