1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <gsl/gsl_cdf.h>
22 #include <gsl/gsl_matrix.h>
23 #include <gsl/gsl_vector.h>
27 #include "regression-export.h"
28 #include <data/case.h>
29 #include <data/casegrouper.h>
30 #include <data/casereader.h>
31 #include <data/category.h>
32 #include <data/dictionary.h>
33 #include <data/missing-values.h>
34 #include <data/procedure.h>
35 #include <data/transformations.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <language/command.h>
39 #include <language/dictionary/split-file.h>
40 #include <language/data-io/file-handle.h>
41 #include <language/lexer/lexer.h>
42 #include <libpspp/alloc.h>
43 #include <libpspp/compiler.h>
44 #include <libpspp/message.h>
45 #include <libpspp/taint.h>
46 #include <math/design-matrix.h>
47 #include <math/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <math/moments.h>
50 #include <output/table.h>
53 #define _(msgid) gettext (msgid)
55 #define REG_LARGE_DATA 1000
60 "REGRESSION" (regression_):
81 +save[sv_]=resid,pred;
86 static struct cmd_regression cmd;
89 Moments for each of the variables used.
94 const struct variable *v;
97 /* Linear regression models. */
98 static pspp_linreg_cache **models = NULL;
101 Transformations for saving predicted values
106 int n_trns; /* Number of transformations. */
107 int trns_id; /* Which trns is this one? */
108 pspp_linreg_cache *c; /* Linear model for this trns. */
111 Variables used (both explanatory and response).
113 static const struct variable **v_variables;
118 static size_t n_variables;
121 File where the model will be saved if the EXPORT subcommand
124 static struct file_handle *model_file;
126 static bool run_regression (struct casereader *, struct cmd_regression *,
130 STATISTICS subcommand output functions.
132 static void reg_stats_r (pspp_linreg_cache *);
133 static void reg_stats_coeff (pspp_linreg_cache *);
134 static void reg_stats_anova (pspp_linreg_cache *);
135 static void reg_stats_outs (pspp_linreg_cache *);
136 static void reg_stats_zpp (pspp_linreg_cache *);
137 static void reg_stats_label (pspp_linreg_cache *);
138 static void reg_stats_sha (pspp_linreg_cache *);
139 static void reg_stats_ci (pspp_linreg_cache *);
140 static void reg_stats_f (pspp_linreg_cache *);
141 static void reg_stats_bcov (pspp_linreg_cache *);
142 static void reg_stats_ses (pspp_linreg_cache *);
143 static void reg_stats_xtx (pspp_linreg_cache *);
144 static void reg_stats_collin (pspp_linreg_cache *);
145 static void reg_stats_tol (pspp_linreg_cache *);
146 static void reg_stats_selection (pspp_linreg_cache *);
147 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
148 int, pspp_linreg_cache *);
151 reg_stats_r (pspp_linreg_cache * c)
161 rsq = c->ssm / c->sst;
162 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
163 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
164 t = tab_create (n_cols, n_rows, 0);
165 tab_dim (t, tab_natural_dimensions);
166 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
167 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
168 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
169 tab_vline (t, TAL_0, 1, 0, 0);
171 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
172 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
173 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
174 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
175 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
176 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
177 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
178 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
179 tab_title (t, _("Model Summary"));
184 Table showing estimated regression coefficients.
187 reg_stats_coeff (pspp_linreg_cache * c)
199 const struct variable *v;
200 const union value *val;
205 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
206 n_rows = c->n_coeffs + 2;
208 t = tab_create (n_cols, n_rows, 0);
209 tab_headers (t, 2, 0, 1, 0);
210 tab_dim (t, tab_natural_dimensions);
211 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
212 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
213 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
214 tab_vline (t, TAL_0, 1, 0, 0);
216 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
217 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
218 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
219 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
220 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
221 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
222 coeff = c->coeff[0]->estimate;
223 tab_float (t, 2, 1, 0, coeff, 10, 2);
224 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
225 tab_float (t, 3, 1, 0, std_err, 10, 2);
226 beta = coeff / c->depvar_std;
227 tab_float (t, 4, 1, 0, beta, 10, 2);
228 t_stat = coeff / std_err;
229 tab_float (t, 5, 1, 0, t_stat, 10, 2);
230 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
231 tab_float (t, 6, 1, 0, pval, 10, 2);
232 for (j = 1; j <= c->n_indeps; j++)
234 v = pspp_coeff_get_var (c->coeff[j], 0);
235 label = var_to_string (v);
236 /* Do not overwrite the variable's name. */
237 strncpy (tmp, label, MAX_STRING);
238 if (var_is_alpha (v))
241 Append the value associated with this coefficient.
242 This makes sense only if we us the usual binary encoding
246 val = pspp_coeff_get_value (c->coeff[j], v);
247 val_s = var_get_value_name (v, val);
248 strncat (tmp, val_s, MAX_STRING);
251 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
253 Regression coefficients.
255 coeff = c->coeff[j]->estimate;
256 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
258 Standard error of the coefficients.
260 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
261 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
263 'Standardized' coefficient, i.e., regression coefficient
264 if all variables had unit variance.
266 beta = gsl_vector_get (c->indep_std, j);
267 beta *= coeff / c->depvar_std;
268 tab_float (t, 4, j + 1, 0, beta, 10, 2);
271 Test statistic for H0: coefficient is 0.
273 t_stat = coeff / std_err;
274 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
276 P values for the test statistic above.
279 2 * gsl_cdf_tdist_Q (fabs (t_stat),
280 (double) (c->n_obs - c->n_coeffs));
281 tab_float (t, 6, j + 1, 0, pval, 10, 2);
283 tab_title (t, _("Coefficients"));
289 Display the ANOVA table.
292 reg_stats_anova (pspp_linreg_cache * c)
296 const double msm = c->ssm / c->dfm;
297 const double mse = c->sse / c->dfe;
298 const double F = msm / mse;
299 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
304 t = tab_create (n_cols, n_rows, 0);
305 tab_headers (t, 2, 0, 1, 0);
306 tab_dim (t, tab_natural_dimensions);
308 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
310 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
311 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
312 tab_vline (t, TAL_0, 1, 0, 0);
314 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
315 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
316 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
317 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
318 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
320 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
321 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
322 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
324 /* Sums of Squares */
325 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
326 tab_float (t, 2, 3, 0, c->sst, 10, 2);
327 tab_float (t, 2, 2, 0, c->sse, 10, 2);
330 /* Degrees of freedom */
331 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
332 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
333 tab_float (t, 3, 3, 0, c->dft, 4, 0);
337 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
338 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
340 tab_float (t, 5, 1, 0, F, 8, 3);
342 tab_float (t, 6, 1, 0, pval, 8, 3);
344 tab_title (t, _("ANOVA"));
348 reg_stats_outs (pspp_linreg_cache * c)
353 reg_stats_zpp (pspp_linreg_cache * c)
358 reg_stats_label (pspp_linreg_cache * c)
363 reg_stats_sha (pspp_linreg_cache * c)
368 reg_stats_ci (pspp_linreg_cache * c)
373 reg_stats_f (pspp_linreg_cache * c)
378 reg_stats_bcov (pspp_linreg_cache * c)
390 n_cols = c->n_indeps + 1 + 2;
391 n_rows = 2 * (c->n_indeps + 1);
392 t = tab_create (n_cols, n_rows, 0);
393 tab_headers (t, 2, 0, 1, 0);
394 tab_dim (t, tab_natural_dimensions);
395 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
396 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
397 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
398 tab_vline (t, TAL_0, 1, 0, 0);
399 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
400 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
401 for (i = 1; i < c->n_coeffs; i++)
403 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
404 label = var_to_string (v);
405 tab_text (t, 2, i, TAB_CENTER, label);
406 tab_text (t, i + 2, 0, TAB_CENTER, label);
407 for (k = 1; k < c->n_coeffs; k++)
409 col = (i <= k) ? k : i;
410 row = (i <= k) ? i : k;
411 tab_float (t, k + 2, i, TAB_CENTER,
412 gsl_matrix_get (c->cov, row, col), 8, 3);
415 tab_title (t, _("Coefficient Correlations"));
419 reg_stats_ses (pspp_linreg_cache * c)
424 reg_stats_xtx (pspp_linreg_cache * c)
429 reg_stats_collin (pspp_linreg_cache * c)
434 reg_stats_tol (pspp_linreg_cache * c)
439 reg_stats_selection (pspp_linreg_cache * c)
445 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
446 int keyword, pspp_linreg_cache * c)
455 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
458 The order here must match the order in which the STATISTICS
459 keywords appear in the specification section above.
486 Set everything but F.
488 for (i = 0; i < f; i++)
495 for (i = 0; i < all; i++)
503 Default output: ANOVA table, parameter estimates,
504 and statistics for variables not entered into model,
507 if (keywords[defaults] | d)
515 statistics_keyword_output (reg_stats_r, keywords[r], c);
516 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
517 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
518 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
519 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
520 statistics_keyword_output (reg_stats_label, keywords[label], c);
521 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
522 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
523 statistics_keyword_output (reg_stats_f, keywords[f], c);
524 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
525 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
526 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
527 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
528 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
529 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
533 Free the transformation. Free its linear model if this
534 transformation is the last one.
537 regression_trns_free (void *t_)
540 struct reg_trns *t = t_;
542 if (t->trns_id == t->n_trns)
544 result = pspp_linreg_cache_free (t->c);
552 Gets the predicted values.
555 regression_trns_pred_proc (void *t_, struct ccase *c,
556 casenumber case_idx UNUSED)
560 struct reg_trns *trns = t_;
561 pspp_linreg_cache *model;
562 union value *output = NULL;
563 const union value **vals = NULL;
564 const struct variable **vars = NULL;
566 assert (trns != NULL);
568 assert (model != NULL);
569 assert (model->depvar != NULL);
570 assert (model->pred != NULL);
572 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
573 n_vals = (*model->get_vars) (model, vars);
575 vals = xnmalloc (n_vals, sizeof (*vals));
576 output = case_data_rw (c, model->pred);
577 assert (output != NULL);
579 for (i = 0; i < n_vals; i++)
581 vals[i] = case_data (c, vars[i]);
583 output->f = (*model->predict) ((const struct variable **) vars,
584 vals, model, n_vals);
587 return TRNS_CONTINUE;
594 regression_trns_resid_proc (void *t_, struct ccase *c,
595 casenumber case_idx UNUSED)
599 struct reg_trns *trns = t_;
600 pspp_linreg_cache *model;
601 union value *output = NULL;
602 const union value **vals = NULL;
603 const union value *obs = NULL;
604 const struct variable **vars = NULL;
606 assert (trns != NULL);
608 assert (model != NULL);
609 assert (model->depvar != NULL);
610 assert (model->resid != NULL);
612 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
613 n_vals = (*model->get_vars) (model, vars);
615 vals = xnmalloc (n_vals, sizeof (*vals));
616 output = case_data_rw (c, model->resid);
617 assert (output != NULL);
619 for (i = 0; i < n_vals; i++)
621 vals[i] = case_data (c, vars[i]);
623 obs = case_data (c, model->depvar);
624 output->f = (*model->residual) ((const struct variable **) vars,
625 vals, obs, model, n_vals);
628 return TRNS_CONTINUE;
632 Returns false if NAME is a duplicate of any existing variable name.
635 try_name (const struct dictionary *dict, const char *name)
637 if (dict_lookup_var (dict, name) != NULL)
644 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
645 const char prefix[LONG_NAME_LEN])
649 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
650 while (!try_name (dict, name))
653 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
658 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
659 pspp_linreg_cache * c, struct variable **v, int n_trns)
661 struct dictionary *dict = dataset_dict (ds);
662 static int trns_index = 1;
663 char name[LONG_NAME_LEN];
664 struct variable *new_var;
665 struct reg_trns *t = NULL;
667 t = xmalloc (sizeof (*t));
668 t->trns_id = trns_index;
671 reg_get_name (dict, name, prefix);
672 new_var = dict_create_var (dict, name, 0);
673 assert (new_var != NULL);
675 add_transformation (ds, f, regression_trns_free, t);
680 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
682 pspp_linreg_cache **lc;
686 assert (models != NULL);
690 /* Count the number of transformations we will need. */
691 for (i = 0; i < REGRESSION_SV_count; i++)
698 n_trns *= cmd.n_dependent;
700 for (lc = models; lc < models + cmd.n_dependent; lc++)
702 assert (*lc != NULL);
703 assert ((*lc)->depvar != NULL);
704 if (cmd.a_save[REGRESSION_SV_RESID])
706 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
707 &(*lc)->resid, n_trns);
709 if (cmd.a_save[REGRESSION_SV_PRED])
711 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
712 &(*lc)->pred, n_trns);
718 for (lc = models; lc < models + cmd.n_dependent; lc++)
722 pspp_linreg_cache_free (*lc);
729 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
733 for (i = 0; i < n_vars; i++)
744 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
748 struct variable **varlist;
750 fprintf (fp, "%s", reg_export_categorical_encode_1);
752 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
753 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
755 struct pspp_coeff *coeff = c->coeff[i];
756 const struct variable *v = pspp_coeff_get_var (coeff, 0);
757 if (var_is_alpha (v))
759 if (!reg_inserted (v, varlist, n_vars))
761 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
763 varlist[n_vars] = (struct variable *) v;
768 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
769 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
771 for (i = 0; i < n_vars - 1; i++)
773 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
775 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
777 for (i = 0; i < n_vars; i++)
779 int n_categories = cat_get_n_categories (varlist[i]);
782 fprintf (fp, "%s.name = \"%s\";\n\t",
783 var_get_name (varlist[i]), var_get_name (varlist[i]));
784 fprintf (fp, "%s.n_vals = %d;\n\t",
785 var_get_name (varlist[i]), n_categories);
787 for (j = 0; j < n_categories; j++)
789 const union value *val = cat_subscript_to_value (j, varlist[i]);
790 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
791 var_get_name (varlist[i]), j,
792 var_get_value_name (varlist[i], val));
795 fprintf (fp, "%s", reg_export_categorical_encode_2);
799 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
802 struct pspp_coeff *coeff;
803 const struct variable *v;
805 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
806 for (i = 1; i < c->n_indeps; i++)
809 v = pspp_coeff_get_var (coeff, 0);
810 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
813 v = pspp_coeff_get_var (coeff, 0);
814 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
817 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
819 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
820 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
821 reg_print_depvars (fp, c);
822 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
824 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
825 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
828 reg_has_categorical (pspp_linreg_cache * c)
831 const struct variable *v;
833 for (i = 1; i < c->n_coeffs; i++)
835 v = pspp_coeff_get_var (c->coeff[i], 0);
836 if (var_is_alpha (v))
843 subcommand_export (int export, pspp_linreg_cache * c)
848 int n_quantiles = 100;
850 struct pspp_coeff *coeff;
855 assert (model_file != NULL);
856 fp = fopen (fh_get_file_name (model_file), "w");
858 fprintf (fp, "%s", reg_preamble);
859 reg_print_getvar (fp, c);
860 if (reg_has_categorical (c))
862 reg_print_categorical_encoding (fp, c);
864 fprintf (fp, "%s", reg_export_t_quantiles_1);
865 for (i = 0; i < n_quantiles - 1; i++)
867 tmp = 0.5 + 0.005 * (double) i;
868 fprintf (fp, "%.15e,\n\t\t",
869 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
871 fprintf (fp, "%.15e};\n\t",
872 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
873 fprintf (fp, "%s", reg_export_t_quantiles_2);
874 fprintf (fp, "%s", reg_mean_cmt);
875 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
876 fprintf (fp, "const char *var_names[])\n{\n\t");
877 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
878 for (i = 1; i < c->n_indeps; i++)
881 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
884 fprintf (fp, "%.15e};\n\t", coeff->estimate);
886 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
887 fprintf (fp, "int i;\n\tint j;\n\n\t");
888 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
889 fprintf (fp, "%s", reg_getvar);
890 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
892 for (i = 0; i < c->cov->size1 - 1; i++)
895 for (j = 0; j < c->cov->size2 - 1; j++)
897 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
899 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
902 for (j = 0; j < c->cov->size2 - 1; j++)
904 fprintf (fp, "%.15e, ",
905 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
907 fprintf (fp, "%.15e}\n\t",
908 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
909 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
911 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
912 fprintf (fp, "%s", reg_variance);
913 fprintf (fp, "%s", reg_export_confidence_interval);
914 tmp = c->mse * c->mse;
915 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
916 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
917 fprintf (fp, "%s", reg_export_prediction_interval_3);
919 fp = fopen ("pspp_model_reg.h", "w");
920 fprintf (fp, "%s", reg_header);
926 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
927 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
929 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
930 if (!lex_force_match (lexer, '('))
933 if (lex_match (lexer, '*'))
937 model_file = fh_parse (lexer, FH_REF_FILE);
938 if (model_file == NULL)
942 if (!lex_force_match (lexer, ')'))
949 cmd_regression (struct lexer *lexer, struct dataset *ds)
951 struct casegrouper *grouper;
952 struct casereader *group;
956 if (!parse_regression (lexer, ds, &cmd, NULL))
959 models = xnmalloc (cmd.n_dependent, sizeof *models);
960 for (i = 0; i < cmd.n_dependent; i++)
966 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
967 while (casegrouper_get_next_group (grouper, &group))
968 run_regression (group, &cmd, ds);
969 ok = casegrouper_destroy (grouper);
970 ok = proc_commit (ds) && ok;
972 subcommand_save (ds, cmd.sbc_save, models);
975 return ok ? CMD_SUCCESS : CMD_FAILURE;
979 Is variable k the dependent variable?
982 is_depvar (size_t k, const struct variable *v)
984 return v == v_variables[k];
987 /* Parser for the variables sub command */
989 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
990 struct cmd_regression *cmd UNUSED,
993 const struct dictionary *dict = dataset_dict (ds);
995 lex_match (lexer, '=');
997 if ((lex_token (lexer) != T_ID
998 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
999 && lex_token (lexer) != T_ALL)
1003 if (!parse_variables_const
1004 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1009 assert (n_variables);
1014 /* Identify the explanatory variables in v_variables. Returns
1015 the number of independent variables. */
1017 identify_indep_vars (struct variable **indep_vars, struct variable *depvar)
1019 int n_indep_vars = 0;
1022 for (i = 0; i < n_variables; i++)
1023 if (!is_depvar (i, depvar))
1024 indep_vars[n_indep_vars++] = v_variables[i];
1026 return n_indep_vars;
1029 /* Encode categorical variables.
1030 Returns number of valid cases. */
1032 prepare_categories (struct casereader *input,
1033 struct variable **vars, size_t n_vars,
1034 struct moments_var *mom)
1040 for (i = 0; i < n_vars; i++)
1041 if (var_is_alpha (vars[i]))
1042 cat_stored_values_create (vars[i]);
1045 for (; casereader_read (input, &c); case_destroy (&c))
1048 The second condition ensures the program will run even if
1049 there is only one variable to act as both explanatory and
1052 for (i = 0; i < n_vars; i++)
1054 const union value *val = case_data (&c, vars[i]);
1055 if (var_is_alpha (vars[i]))
1056 cat_value_update (vars[i], val);
1058 moments1_add (mom[i].m, val->f, 1.0);
1062 casereader_destroy (input);
1068 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1070 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1071 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1072 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1073 pspp_coeff_init (c->coeff + 1, dm);
1077 Put the moments in the linreg cache.
1080 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1081 struct design_matrix *dm, size_t n)
1091 Scan the variable names in the columns of the design matrix.
1092 When we find the variable we need, insert its mean in the cache.
1094 for (i = 0; i < dm->m->size2; i++)
1096 for (j = 0; j < n; j++)
1098 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1100 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1101 &skewness, &kurtosis);
1102 gsl_vector_set (c->indep_means, i, mean);
1103 gsl_vector_set (c->indep_std, i, sqrt (variance));
1110 run_regression (struct casereader *input, struct cmd_regression *cmd,
1117 const struct variable **indep_vars;
1118 struct design_matrix *X;
1119 struct moments_var *mom;
1122 pspp_linreg_opts lopts;
1124 assert (models != NULL);
1126 if (!casereader_peek (input, 0, &c))
1128 output_split_file_values (ds, &c);
1133 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1137 for (i = 0; i < cmd->n_dependent; i++)
1139 if (!var_is_numeric (cmd->v_dependent[i]))
1141 msg (SE, _("Dependent variable must be numeric."));
1146 mom = xnmalloc (n_variables, sizeof (*mom));
1147 for (i = 0; i < n_variables; i++)
1149 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1150 (mom + i)->v = v_variables[i];
1152 lopts.get_depvar_mean_std = 1;
1154 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1155 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1157 for (k = 0; k < cmd->n_dependent; k++)
1159 struct variable *dep_var;
1160 struct casereader *reader;
1163 size_t n_data; /* Number of valid cases. */
1165 dep_var = cmd->v_dependent[k];
1166 n_indep = identify_indep_vars (indep_vars, dep_var);
1168 reader = casereader_clone (input);
1169 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1171 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1173 n_data = prepare_categories (casereader_clone (reader),
1174 indep_vars, n_indep, mom);
1176 if ((n_data > 0) && (n_indep > 0))
1178 Y = gsl_vector_alloc (n_data);
1180 design_matrix_create (n_indep,
1181 (const struct variable **) indep_vars,
1183 for (i = 0; i < X->m->size2; i++)
1185 lopts.get_indep_mean_std[i] = 1;
1187 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1188 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1189 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1190 models[k]->depvar = dep_var;
1192 For large data sets, use QR decomposition.
1194 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1196 models[k]->method = PSPP_LINREG_SVD;
1200 The second pass fills the design matrix.
1202 reader = casereader_create_counter (reader, &row, -1);
1203 for (; casereader_read (reader, &c); case_destroy (&c))
1205 for (i = 0; i < n_indep; ++i)
1207 struct variable *v = indep_vars[i];
1208 const union value *val = case_data (&c, v);
1209 if (var_is_alpha (v))
1210 design_matrix_set_categorical (X, row, v, val);
1212 design_matrix_set_numeric (X, row, v, val);
1214 gsl_vector_set (Y, row, case_num (&c, dep_var));
1216 casereader_destroy (reader);
1218 Now that we know the number of coefficients, allocate space
1219 and store pointers to the variables that correspond to the
1222 coeff_init (models[k], X);
1225 Find the least-squares estimates and other statistics.
1227 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1228 compute_moments (models[k], mom, X, n_variables);
1230 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1232 subcommand_statistics (cmd->a_statistics, models[k]);
1233 subcommand_export (cmd->sbc_export, models[k]);
1236 gsl_vector_free (Y);
1237 design_matrix_destroy (X);
1241 msg (SE, gettext ("No valid data found. This command was skipped."));
1245 free (lopts.get_indep_mean_std);
1246 casereader_destroy (input);