1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include "regression-export.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/category.h>
30 #include <data/dictionary.h>
31 #include <data/missing-values.h>
32 #include <data/procedure.h>
33 #include <data/transformations.h>
34 #include <data/value-labels.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/dictionary/split-file.h>
38 #include <language/data-io/file-handle.h>
39 #include <language/lexer/lexer.h>
40 #include <libpspp/alloc.h>
41 #include <libpspp/compiler.h>
42 #include <libpspp/message.h>
43 #include <libpspp/taint.h>
44 #include <math/design-matrix.h>
45 #include <math/coefficient.h>
46 #include <math/linreg/linreg.h>
47 #include <math/moments.h>
48 #include <output/table.h>
51 #define _(msgid) gettext (msgid)
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
87 Moments for each of the variables used.
92 const struct variable *v;
96 Transformations for saving predicted values
101 int n_trns; /* Number of transformations. */
102 int trns_id; /* Which trns is this one? */
103 pspp_linreg_cache *c; /* Linear model for this trns. */
106 Variables used (both explanatory and response).
108 static const struct variable **v_variables;
113 static size_t n_variables;
116 File where the model will be saved if the EXPORT subcommand
119 static struct file_handle *model_file;
121 static bool run_regression (struct casereader *, struct cmd_regression *,
122 struct dataset *, pspp_linreg_cache **);
125 STATISTICS subcommand output functions.
127 static void reg_stats_r (pspp_linreg_cache *);
128 static void reg_stats_coeff (pspp_linreg_cache *);
129 static void reg_stats_anova (pspp_linreg_cache *);
130 static void reg_stats_outs (pspp_linreg_cache *);
131 static void reg_stats_zpp (pspp_linreg_cache *);
132 static void reg_stats_label (pspp_linreg_cache *);
133 static void reg_stats_sha (pspp_linreg_cache *);
134 static void reg_stats_ci (pspp_linreg_cache *);
135 static void reg_stats_f (pspp_linreg_cache *);
136 static void reg_stats_bcov (pspp_linreg_cache *);
137 static void reg_stats_ses (pspp_linreg_cache *);
138 static void reg_stats_xtx (pspp_linreg_cache *);
139 static void reg_stats_collin (pspp_linreg_cache *);
140 static void reg_stats_tol (pspp_linreg_cache *);
141 static void reg_stats_selection (pspp_linreg_cache *);
142 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
143 int, pspp_linreg_cache *);
146 reg_stats_r (pspp_linreg_cache * c)
156 rsq = c->ssm / c->sst;
157 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
158 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
159 t = tab_create (n_cols, n_rows, 0);
160 tab_dim (t, tab_natural_dimensions);
161 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
162 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
163 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
164 tab_vline (t, TAL_0, 1, 0, 0);
166 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
167 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
168 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
169 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
170 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
171 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
172 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
173 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
174 tab_title (t, _("Model Summary"));
179 Table showing estimated regression coefficients.
182 reg_stats_coeff (pspp_linreg_cache * c)
194 const struct variable *v;
195 const union value *val;
200 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
201 n_rows = c->n_coeffs + 2;
203 t = tab_create (n_cols, n_rows, 0);
204 tab_headers (t, 2, 0, 1, 0);
205 tab_dim (t, tab_natural_dimensions);
206 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
207 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
208 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
209 tab_vline (t, TAL_0, 1, 0, 0);
211 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
212 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
213 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
214 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
215 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
216 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
217 coeff = c->coeff[0]->estimate;
218 tab_float (t, 2, 1, 0, coeff, 10, 2);
219 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
220 tab_float (t, 3, 1, 0, std_err, 10, 2);
221 beta = coeff / c->depvar_std;
222 tab_float (t, 4, 1, 0, beta, 10, 2);
223 t_stat = coeff / std_err;
224 tab_float (t, 5, 1, 0, t_stat, 10, 2);
225 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
226 tab_float (t, 6, 1, 0, pval, 10, 2);
227 for (j = 1; j <= c->n_indeps; j++)
229 v = pspp_coeff_get_var (c->coeff[j], 0);
230 label = var_to_string (v);
231 /* Do not overwrite the variable's name. */
232 strncpy (tmp, label, MAX_STRING);
233 if (var_is_alpha (v))
236 Append the value associated with this coefficient.
237 This makes sense only if we us the usual binary encoding
241 val = pspp_coeff_get_value (c->coeff[j], v);
242 val_s = var_get_value_name (v, val);
243 strncat (tmp, val_s, MAX_STRING);
246 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
248 Regression coefficients.
250 coeff = c->coeff[j]->estimate;
251 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
253 Standard error of the coefficients.
255 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
256 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
258 'Standardized' coefficient, i.e., regression coefficient
259 if all variables had unit variance.
261 beta = gsl_vector_get (c->indep_std, j);
262 beta *= coeff / c->depvar_std;
263 tab_float (t, 4, j + 1, 0, beta, 10, 2);
266 Test statistic for H0: coefficient is 0.
268 t_stat = coeff / std_err;
269 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
271 P values for the test statistic above.
274 2 * gsl_cdf_tdist_Q (fabs (t_stat),
275 (double) (c->n_obs - c->n_coeffs));
276 tab_float (t, 6, j + 1, 0, pval, 10, 2);
278 tab_title (t, _("Coefficients"));
284 Display the ANOVA table.
287 reg_stats_anova (pspp_linreg_cache * c)
291 const double msm = c->ssm / c->dfm;
292 const double mse = c->sse / c->dfe;
293 const double F = msm / mse;
294 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
299 t = tab_create (n_cols, n_rows, 0);
300 tab_headers (t, 2, 0, 1, 0);
301 tab_dim (t, tab_natural_dimensions);
303 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
305 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
306 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
307 tab_vline (t, TAL_0, 1, 0, 0);
309 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
310 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
311 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
312 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
313 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
315 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
316 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
317 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
319 /* Sums of Squares */
320 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
321 tab_float (t, 2, 3, 0, c->sst, 10, 2);
322 tab_float (t, 2, 2, 0, c->sse, 10, 2);
325 /* Degrees of freedom */
326 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
327 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
328 tab_float (t, 3, 3, 0, c->dft, 4, 0);
332 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
333 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
335 tab_float (t, 5, 1, 0, F, 8, 3);
337 tab_float (t, 6, 1, 0, pval, 8, 3);
339 tab_title (t, _("ANOVA"));
343 reg_stats_outs (pspp_linreg_cache * c)
348 reg_stats_zpp (pspp_linreg_cache * c)
353 reg_stats_label (pspp_linreg_cache * c)
358 reg_stats_sha (pspp_linreg_cache * c)
363 reg_stats_ci (pspp_linreg_cache * c)
368 reg_stats_f (pspp_linreg_cache * c)
373 reg_stats_bcov (pspp_linreg_cache * c)
385 n_cols = c->n_indeps + 1 + 2;
386 n_rows = 2 * (c->n_indeps + 1);
387 t = tab_create (n_cols, n_rows, 0);
388 tab_headers (t, 2, 0, 1, 0);
389 tab_dim (t, tab_natural_dimensions);
390 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
391 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
392 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
393 tab_vline (t, TAL_0, 1, 0, 0);
394 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
395 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
396 for (i = 1; i < c->n_coeffs; i++)
398 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
399 label = var_to_string (v);
400 tab_text (t, 2, i, TAB_CENTER, label);
401 tab_text (t, i + 2, 0, TAB_CENTER, label);
402 for (k = 1; k < c->n_coeffs; k++)
404 col = (i <= k) ? k : i;
405 row = (i <= k) ? i : k;
406 tab_float (t, k + 2, i, TAB_CENTER,
407 gsl_matrix_get (c->cov, row, col), 8, 3);
410 tab_title (t, _("Coefficient Correlations"));
414 reg_stats_ses (pspp_linreg_cache * c)
419 reg_stats_xtx (pspp_linreg_cache * c)
424 reg_stats_collin (pspp_linreg_cache * c)
429 reg_stats_tol (pspp_linreg_cache * c)
434 reg_stats_selection (pspp_linreg_cache * c)
440 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
441 int keyword, pspp_linreg_cache * c)
450 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
453 The order here must match the order in which the STATISTICS
454 keywords appear in the specification section above.
481 Set everything but F.
483 for (i = 0; i < f; i++)
490 for (i = 0; i < all; i++)
498 Default output: ANOVA table, parameter estimates,
499 and statistics for variables not entered into model,
502 if (keywords[defaults] | d)
510 statistics_keyword_output (reg_stats_r, keywords[r], c);
511 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
512 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
513 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
514 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
515 statistics_keyword_output (reg_stats_label, keywords[label], c);
516 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
517 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
518 statistics_keyword_output (reg_stats_f, keywords[f], c);
519 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
520 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
521 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
522 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
523 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
524 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
528 Free the transformation. Free its linear model if this
529 transformation is the last one.
532 regression_trns_free (void *t_)
535 struct reg_trns *t = t_;
537 if (t->trns_id == t->n_trns)
539 result = pspp_linreg_cache_free (t->c);
547 Gets the predicted values.
550 regression_trns_pred_proc (void *t_, struct ccase *c,
551 casenumber case_idx UNUSED)
555 struct reg_trns *trns = t_;
556 pspp_linreg_cache *model;
557 union value *output = NULL;
558 const union value **vals = NULL;
559 const struct variable **vars = NULL;
561 assert (trns != NULL);
563 assert (model != NULL);
564 assert (model->depvar != NULL);
565 assert (model->pred != NULL);
567 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
568 n_vals = (*model->get_vars) (model, vars);
570 vals = xnmalloc (n_vals, sizeof (*vals));
571 output = case_data_rw (c, model->pred);
572 assert (output != NULL);
574 for (i = 0; i < n_vals; i++)
576 vals[i] = case_data (c, vars[i]);
578 output->f = (*model->predict) ((const struct variable **) vars,
579 vals, model, n_vals);
582 return TRNS_CONTINUE;
589 regression_trns_resid_proc (void *t_, struct ccase *c,
590 casenumber case_idx UNUSED)
594 struct reg_trns *trns = t_;
595 pspp_linreg_cache *model;
596 union value *output = NULL;
597 const union value **vals = NULL;
598 const union value *obs = NULL;
599 const struct variable **vars = NULL;
601 assert (trns != NULL);
603 assert (model != NULL);
604 assert (model->depvar != NULL);
605 assert (model->resid != NULL);
607 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
608 n_vals = (*model->get_vars) (model, vars);
610 vals = xnmalloc (n_vals, sizeof (*vals));
611 output = case_data_rw (c, model->resid);
612 assert (output != NULL);
614 for (i = 0; i < n_vals; i++)
616 vals[i] = case_data (c, vars[i]);
618 obs = case_data (c, model->depvar);
619 output->f = (*model->residual) ((const struct variable **) vars,
620 vals, obs, model, n_vals);
623 return TRNS_CONTINUE;
627 Returns false if NAME is a duplicate of any existing variable name.
630 try_name (const struct dictionary *dict, const char *name)
632 if (dict_lookup_var (dict, name) != NULL)
639 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
640 const char prefix[LONG_NAME_LEN])
644 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
645 while (!try_name (dict, name))
648 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
653 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
654 pspp_linreg_cache * c, struct variable **v, int n_trns)
656 struct dictionary *dict = dataset_dict (ds);
657 static int trns_index = 1;
658 char name[LONG_NAME_LEN];
659 struct variable *new_var;
660 struct reg_trns *t = NULL;
662 t = xmalloc (sizeof (*t));
663 t->trns_id = trns_index;
666 reg_get_name (dict, name, prefix);
667 new_var = dict_create_var (dict, name, 0);
668 assert (new_var != NULL);
670 add_transformation (ds, f, regression_trns_free, t);
675 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
677 pspp_linreg_cache **lc;
681 assert (models != NULL);
685 /* Count the number of transformations we will need. */
686 for (i = 0; i < REGRESSION_SV_count; i++)
693 n_trns *= cmd.n_dependent;
695 for (lc = models; lc < models + cmd.n_dependent; lc++)
697 assert (*lc != NULL);
698 assert ((*lc)->depvar != NULL);
699 if (cmd.a_save[REGRESSION_SV_RESID])
701 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
702 &(*lc)->resid, n_trns);
704 if (cmd.a_save[REGRESSION_SV_PRED])
706 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
707 &(*lc)->pred, n_trns);
713 for (lc = models; lc < models + cmd.n_dependent; lc++)
717 pspp_linreg_cache_free (*lc);
724 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
728 for (i = 0; i < n_vars; i++)
739 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
743 struct variable **varlist;
745 fprintf (fp, "%s", reg_export_categorical_encode_1);
747 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
748 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
750 struct pspp_coeff *coeff = c->coeff[i];
751 const struct variable *v = pspp_coeff_get_var (coeff, 0);
752 if (var_is_alpha (v))
754 if (!reg_inserted (v, varlist, n_vars))
756 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
758 varlist[n_vars] = (struct variable *) v;
763 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
764 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
766 for (i = 0; i < n_vars - 1; i++)
768 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
770 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
772 for (i = 0; i < n_vars; i++)
774 int n_categories = cat_get_n_categories (varlist[i]);
777 fprintf (fp, "%s.name = \"%s\";\n\t",
778 var_get_name (varlist[i]), var_get_name (varlist[i]));
779 fprintf (fp, "%s.n_vals = %d;\n\t",
780 var_get_name (varlist[i]), n_categories);
782 for (j = 0; j < n_categories; j++)
784 const union value *val = cat_subscript_to_value (j, varlist[i]);
785 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
786 var_get_name (varlist[i]), j,
787 var_get_value_name (varlist[i], val));
790 fprintf (fp, "%s", reg_export_categorical_encode_2);
794 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
797 struct pspp_coeff *coeff;
798 const struct variable *v;
800 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
801 for (i = 1; i < c->n_indeps; i++)
804 v = pspp_coeff_get_var (coeff, 0);
805 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
808 v = pspp_coeff_get_var (coeff, 0);
809 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
812 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
814 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
815 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
816 reg_print_depvars (fp, c);
817 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
819 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
820 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
823 reg_has_categorical (pspp_linreg_cache * c)
826 const struct variable *v;
828 for (i = 1; i < c->n_coeffs; i++)
830 v = pspp_coeff_get_var (c->coeff[i], 0);
831 if (var_is_alpha (v))
838 subcommand_export (int export, pspp_linreg_cache * c)
843 int n_quantiles = 100;
845 struct pspp_coeff *coeff;
850 assert (model_file != NULL);
851 fp = fopen (fh_get_file_name (model_file), "w");
853 fprintf (fp, "%s", reg_preamble);
854 reg_print_getvar (fp, c);
855 if (reg_has_categorical (c))
857 reg_print_categorical_encoding (fp, c);
859 fprintf (fp, "%s", reg_export_t_quantiles_1);
860 for (i = 0; i < n_quantiles - 1; i++)
862 tmp = 0.5 + 0.005 * (double) i;
863 fprintf (fp, "%.15e,\n\t\t",
864 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
866 fprintf (fp, "%.15e};\n\t",
867 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
868 fprintf (fp, "%s", reg_export_t_quantiles_2);
869 fprintf (fp, "%s", reg_mean_cmt);
870 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
871 fprintf (fp, "const char *var_names[])\n{\n\t");
872 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
873 for (i = 1; i < c->n_indeps; i++)
876 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
879 fprintf (fp, "%.15e};\n\t", coeff->estimate);
881 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
882 fprintf (fp, "int i;\n\tint j;\n\n\t");
883 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
884 fprintf (fp, "%s", reg_getvar);
885 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
887 for (i = 0; i < c->cov->size1 - 1; i++)
890 for (j = 0; j < c->cov->size2 - 1; j++)
892 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
894 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
897 for (j = 0; j < c->cov->size2 - 1; j++)
899 fprintf (fp, "%.15e, ",
900 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
902 fprintf (fp, "%.15e}\n\t",
903 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
904 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
906 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
907 fprintf (fp, "%s", reg_variance);
908 fprintf (fp, "%s", reg_export_confidence_interval);
909 tmp = c->mse * c->mse;
910 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
911 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
912 fprintf (fp, "%s", reg_export_prediction_interval_3);
914 fp = fopen ("pspp_model_reg.h", "w");
915 fprintf (fp, "%s", reg_header);
921 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
922 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
924 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
925 if (!lex_force_match (lexer, '('))
928 if (lex_match (lexer, '*'))
932 model_file = fh_parse (lexer, FH_REF_FILE);
933 if (model_file == NULL)
937 if (!lex_force_match (lexer, ')'))
944 cmd_regression (struct lexer *lexer, struct dataset *ds)
946 struct casegrouper *grouper;
947 struct casereader *group;
948 pspp_linreg_cache **models;
952 if (!parse_regression (lexer, ds, &cmd, NULL))
955 models = xnmalloc (cmd.n_dependent, sizeof *models);
956 for (i = 0; i < cmd.n_dependent; i++)
962 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
963 while (casegrouper_get_next_group (grouper, &group))
964 run_regression (group, &cmd, ds, models);
965 ok = casegrouper_destroy (grouper);
966 ok = proc_commit (ds) && ok;
968 subcommand_save (ds, cmd.sbc_save, models);
971 free_regression (&cmd);
973 return ok ? CMD_SUCCESS : CMD_FAILURE;
977 Is variable k the dependent variable?
980 is_depvar (size_t k, const struct variable *v)
982 return v == v_variables[k];
985 /* Parser for the variables sub command */
987 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
988 struct cmd_regression *cmd UNUSED,
991 const struct dictionary *dict = dataset_dict (ds);
993 lex_match (lexer, '=');
995 if ((lex_token (lexer) != T_ID
996 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
997 && lex_token (lexer) != T_ALL)
1001 if (!parse_variables_const
1002 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1007 assert (n_variables);
1012 /* Identify the explanatory variables in v_variables. Returns
1013 the number of independent variables. */
1015 identify_indep_vars (const struct variable **indep_vars,
1016 const struct variable *depvar)
1018 int n_indep_vars = 0;
1021 for (i = 0; i < n_variables; i++)
1022 if (!is_depvar (i, depvar))
1023 indep_vars[n_indep_vars++] = v_variables[i];
1024 if ((n_indep_vars < 2) && is_depvar (0, depvar))
1027 There is only one independent variable, and it is the same
1028 as the dependent variable. Print a warning and continue.
1031 gettext ("The dependent variable is equal to the independent variable."
1032 "The least squares line is therefore Y=X."
1033 "Standard errors and related statistics may be meaningless."));
1035 indep_vars[0] = v_variables[0];
1037 return n_indep_vars;
1040 /* Encode categorical variables.
1041 Returns number of valid cases. */
1043 prepare_categories (struct casereader *input,
1044 const struct variable **vars, size_t n_vars,
1045 struct moments_var *mom)
1051 assert (vars != NULL);
1052 assert (mom != NULL);
1054 for (i = 0; i < n_vars; i++)
1055 if (var_is_alpha (vars[i]))
1056 cat_stored_values_create (vars[i]);
1059 for (; casereader_read (input, &c); case_destroy (&c))
1062 The second condition ensures the program will run even if
1063 there is only one variable to act as both explanatory and
1066 for (i = 0; i < n_vars; i++)
1068 const union value *val = case_data (&c, vars[i]);
1069 if (var_is_alpha (vars[i]))
1070 cat_value_update (vars[i], val);
1072 moments1_add (mom[i].m, val->f, 1.0);
1076 casereader_destroy (input);
1082 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1084 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1085 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1086 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1087 pspp_coeff_init (c->coeff + 1, dm);
1091 Put the moments in the linreg cache.
1094 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1095 struct design_matrix *dm, size_t n)
1105 Scan the variable names in the columns of the design matrix.
1106 When we find the variable we need, insert its mean in the cache.
1108 for (i = 0; i < dm->m->size2; i++)
1110 for (j = 0; j < n; j++)
1112 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1114 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1115 &skewness, &kurtosis);
1116 gsl_vector_set (c->indep_means, i, mean);
1117 gsl_vector_set (c->indep_std, i, sqrt (variance));
1124 run_regression (struct casereader *input, struct cmd_regression *cmd,
1125 struct dataset *ds, pspp_linreg_cache **models)
1131 const struct variable **indep_vars;
1132 struct design_matrix *X;
1133 struct moments_var *mom;
1136 pspp_linreg_opts lopts;
1138 assert (models != NULL);
1140 if (!casereader_peek (input, 0, &c))
1142 casereader_destroy (input);
1145 output_split_file_values (ds, &c);
1150 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1154 for (i = 0; i < cmd->n_dependent; i++)
1156 if (!var_is_numeric (cmd->v_dependent[i]))
1158 msg (SE, _("Dependent variable must be numeric."));
1163 mom = xnmalloc (n_variables, sizeof (*mom));
1164 for (i = 0; i < n_variables; i++)
1166 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1167 (mom + i)->v = v_variables[i];
1169 lopts.get_depvar_mean_std = 1;
1171 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1172 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1174 for (k = 0; k < cmd->n_dependent; k++)
1176 const struct variable *dep_var;
1177 struct casereader *reader;
1180 size_t n_data; /* Number of valid cases. */
1182 dep_var = cmd->v_dependent[k];
1183 n_indep = identify_indep_vars (indep_vars, dep_var);
1184 reader = casereader_clone (input);
1185 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1187 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1189 n_data = prepare_categories (casereader_clone (reader),
1190 indep_vars, n_indep, mom);
1192 if ((n_data > 0) && (n_indep > 0))
1194 Y = gsl_vector_alloc (n_data);
1196 design_matrix_create (n_indep,
1197 (const struct variable **) indep_vars,
1199 for (i = 0; i < X->m->size2; i++)
1201 lopts.get_indep_mean_std[i] = 1;
1203 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1204 models[k]->depvar = dep_var;
1206 For large data sets, use QR decomposition.
1208 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1210 models[k]->method = PSPP_LINREG_QR;
1214 The second pass fills the design matrix.
1216 reader = casereader_create_counter (reader, &row, -1);
1217 for (; casereader_read (reader, &c); case_destroy (&c))
1219 for (i = 0; i < n_indep; ++i)
1221 const struct variable *v = indep_vars[i];
1222 const union value *val = case_data (&c, v);
1223 if (var_is_alpha (v))
1224 design_matrix_set_categorical (X, row, v, val);
1226 design_matrix_set_numeric (X, row, v, val);
1228 gsl_vector_set (Y, row, case_num (&c, dep_var));
1231 Now that we know the number of coefficients, allocate space
1232 and store pointers to the variables that correspond to the
1235 coeff_init (models[k], X);
1238 Find the least-squares estimates and other statistics.
1240 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1241 compute_moments (models[k], mom, X, n_variables);
1243 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1245 subcommand_statistics (cmd->a_statistics, models[k]);
1246 subcommand_export (cmd->sbc_export, models[k]);
1249 gsl_vector_free (Y);
1250 design_matrix_destroy (X);
1255 gettext ("No valid data found. This command was skipped."));
1257 casereader_destroy (reader);
1259 for (i = 0; i < n_variables; i++)
1261 moments1_destroy ((mom + i)->m);
1265 free (lopts.get_indep_mean_std);
1266 casereader_destroy (input);