1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <gsl/gsl_cdf.h>
22 #include <gsl/gsl_matrix.h>
23 #include <gsl/gsl_vector.h>
27 #include "regression-export.h"
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/category.h>
31 #include <data/dictionary.h>
32 #include <data/missing-values.h>
33 #include <data/procedure.h>
34 #include <data/transformations.h>
35 #include <data/value-labels.h>
36 #include <data/variable.h>
37 #include <language/command.h>
38 #include <language/dictionary/split-file.h>
39 #include <language/data-io/file-handle.h>
40 #include <language/lexer/lexer.h>
41 #include <libpspp/alloc.h>
42 #include <libpspp/compiler.h>
43 #include <libpspp/message.h>
44 #include <math/design-matrix.h>
45 #include <math/coefficient.h>
46 #include <math/linreg/linreg.h>
47 #include <math/moments.h>
48 #include <output/table.h>
52 #define REG_LARGE_DATA 1000
57 "REGRESSION" (regression_):
78 +save[sv_]=resid,pred;
83 static struct cmd_regression cmd;
86 Moments for each of the variables used.
91 const struct variable *v;
94 /* Linear regression models. */
95 static pspp_linreg_cache **models = NULL;
98 Transformations for saving predicted values
103 int n_trns; /* Number of transformations. */
104 int trns_id; /* Which trns is this one? */
105 pspp_linreg_cache *c; /* Linear model for this trns. */
108 Variables used (both explanatory and response).
110 static const struct variable **v_variables;
115 static size_t n_variables;
118 File where the model will be saved if the EXPORT subcommand
121 static struct file_handle *model_file;
124 Return value for the procedure.
126 static int pspp_reg_rc = CMD_SUCCESS;
128 static bool run_regression (const struct ccase *,
129 const struct casefile *, void *,
130 const struct dataset *);
133 STATISTICS subcommand output functions.
135 static void reg_stats_r (pspp_linreg_cache *);
136 static void reg_stats_coeff (pspp_linreg_cache *);
137 static void reg_stats_anova (pspp_linreg_cache *);
138 static void reg_stats_outs (pspp_linreg_cache *);
139 static void reg_stats_zpp (pspp_linreg_cache *);
140 static void reg_stats_label (pspp_linreg_cache *);
141 static void reg_stats_sha (pspp_linreg_cache *);
142 static void reg_stats_ci (pspp_linreg_cache *);
143 static void reg_stats_f (pspp_linreg_cache *);
144 static void reg_stats_bcov (pspp_linreg_cache *);
145 static void reg_stats_ses (pspp_linreg_cache *);
146 static void reg_stats_xtx (pspp_linreg_cache *);
147 static void reg_stats_collin (pspp_linreg_cache *);
148 static void reg_stats_tol (pspp_linreg_cache *);
149 static void reg_stats_selection (pspp_linreg_cache *);
150 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
151 int, pspp_linreg_cache *);
154 reg_stats_r (pspp_linreg_cache * c)
164 rsq = c->ssm / c->sst;
165 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
166 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
167 t = tab_create (n_cols, n_rows, 0);
168 tab_dim (t, tab_natural_dimensions);
169 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
170 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
171 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
172 tab_vline (t, TAL_0, 1, 0, 0);
174 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
175 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
176 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
177 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
178 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
179 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
180 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
181 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
182 tab_title (t, _("Model Summary"));
187 Table showing estimated regression coefficients.
190 reg_stats_coeff (pspp_linreg_cache * c)
202 const struct variable *v;
203 const union value *val;
208 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
209 n_rows = c->n_coeffs + 2;
211 t = tab_create (n_cols, n_rows, 0);
212 tab_headers (t, 2, 0, 1, 0);
213 tab_dim (t, tab_natural_dimensions);
214 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
215 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
216 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
217 tab_vline (t, TAL_0, 1, 0, 0);
219 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
220 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
221 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
222 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
223 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
224 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
225 coeff = c->coeff[0]->estimate;
226 tab_float (t, 2, 1, 0, coeff, 10, 2);
227 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
228 tab_float (t, 3, 1, 0, std_err, 10, 2);
229 beta = coeff / c->depvar_std;
230 tab_float (t, 4, 1, 0, beta, 10, 2);
231 t_stat = coeff / std_err;
232 tab_float (t, 5, 1, 0, t_stat, 10, 2);
233 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
234 tab_float (t, 6, 1, 0, pval, 10, 2);
235 for (j = 1; j <= c->n_indeps; j++)
237 v = pspp_coeff_get_var (c->coeff[j], 0);
238 label = var_to_string (v);
239 /* Do not overwrite the variable's name. */
240 strncpy (tmp, label, MAX_STRING);
241 if (var_is_alpha (v))
244 Append the value associated with this coefficient.
245 This makes sense only if we us the usual binary encoding
249 val = pspp_coeff_get_value (c->coeff[j], v);
250 val_s = var_get_value_name (v, val);
251 strncat (tmp, val_s, MAX_STRING);
254 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
256 Regression coefficients.
258 coeff = c->coeff[j]->estimate;
259 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
261 Standard error of the coefficients.
263 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
264 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
266 'Standardized' coefficient, i.e., regression coefficient
267 if all variables had unit variance.
269 beta = gsl_vector_get (c->indep_std, j);
270 beta *= coeff / c->depvar_std;
271 tab_float (t, 4, j + 1, 0, beta, 10, 2);
274 Test statistic for H0: coefficient is 0.
276 t_stat = coeff / std_err;
277 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
279 P values for the test statistic above.
282 2 * gsl_cdf_tdist_Q (fabs (t_stat),
283 (double) (c->n_obs - c->n_coeffs));
284 tab_float (t, 6, j + 1, 0, pval, 10, 2);
286 tab_title (t, _("Coefficients"));
292 Display the ANOVA table.
295 reg_stats_anova (pspp_linreg_cache * c)
299 const double msm = c->ssm / c->dfm;
300 const double mse = c->sse / c->dfe;
301 const double F = msm / mse;
302 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
307 t = tab_create (n_cols, n_rows, 0);
308 tab_headers (t, 2, 0, 1, 0);
309 tab_dim (t, tab_natural_dimensions);
311 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
313 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
314 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
315 tab_vline (t, TAL_0, 1, 0, 0);
317 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
318 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
319 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
320 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
321 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
323 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
324 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
325 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
327 /* Sums of Squares */
328 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
329 tab_float (t, 2, 3, 0, c->sst, 10, 2);
330 tab_float (t, 2, 2, 0, c->sse, 10, 2);
333 /* Degrees of freedom */
334 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
335 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
336 tab_float (t, 3, 3, 0, c->dft, 4, 0);
340 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
341 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
343 tab_float (t, 5, 1, 0, F, 8, 3);
345 tab_float (t, 6, 1, 0, pval, 8, 3);
347 tab_title (t, _("ANOVA"));
351 reg_stats_outs (pspp_linreg_cache * c)
356 reg_stats_zpp (pspp_linreg_cache * c)
361 reg_stats_label (pspp_linreg_cache * c)
366 reg_stats_sha (pspp_linreg_cache * c)
371 reg_stats_ci (pspp_linreg_cache * c)
376 reg_stats_f (pspp_linreg_cache * c)
381 reg_stats_bcov (pspp_linreg_cache * c)
393 n_cols = c->n_indeps + 1 + 2;
394 n_rows = 2 * (c->n_indeps + 1);
395 t = tab_create (n_cols, n_rows, 0);
396 tab_headers (t, 2, 0, 1, 0);
397 tab_dim (t, tab_natural_dimensions);
398 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
399 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
400 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
401 tab_vline (t, TAL_0, 1, 0, 0);
402 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
403 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
404 for (i = 1; i < c->n_coeffs; i++)
406 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
407 label = var_to_string (v);
408 tab_text (t, 2, i, TAB_CENTER, label);
409 tab_text (t, i + 2, 0, TAB_CENTER, label);
410 for (k = 1; k < c->n_coeffs; k++)
412 col = (i <= k) ? k : i;
413 row = (i <= k) ? i : k;
414 tab_float (t, k + 2, i, TAB_CENTER,
415 gsl_matrix_get (c->cov, row, col), 8, 3);
418 tab_title (t, _("Coefficient Correlations"));
422 reg_stats_ses (pspp_linreg_cache * c)
427 reg_stats_xtx (pspp_linreg_cache * c)
432 reg_stats_collin (pspp_linreg_cache * c)
437 reg_stats_tol (pspp_linreg_cache * c)
442 reg_stats_selection (pspp_linreg_cache * c)
448 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
449 int keyword, pspp_linreg_cache * c)
458 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
461 The order here must match the order in which the STATISTICS
462 keywords appear in the specification section above.
489 Set everything but F.
491 for (i = 0; i < f; i++)
498 for (i = 0; i < all; i++)
506 Default output: ANOVA table, parameter estimates,
507 and statistics for variables not entered into model,
510 if (keywords[defaults] | d)
518 statistics_keyword_output (reg_stats_r, keywords[r], c);
519 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
520 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
521 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
522 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
523 statistics_keyword_output (reg_stats_label, keywords[label], c);
524 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
525 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
526 statistics_keyword_output (reg_stats_f, keywords[f], c);
527 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
528 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
529 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
530 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
531 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
532 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
536 Free the transformation. Free its linear model if this
537 transformation is the last one.
540 regression_trns_free (void *t_)
543 struct reg_trns *t = t_;
545 if (t->trns_id == t->n_trns)
547 result = pspp_linreg_cache_free (t->c);
555 Gets the predicted values.
558 regression_trns_pred_proc (void *t_, struct ccase *c,
559 casenumber case_idx UNUSED)
563 struct reg_trns *trns = t_;
564 pspp_linreg_cache *model;
565 union value *output = NULL;
566 const union value **vals = NULL;
567 const struct variable **vars = NULL;
569 assert (trns != NULL);
571 assert (model != NULL);
572 assert (model->depvar != NULL);
573 assert (model->pred != NULL);
575 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
576 n_vals = (*model->get_vars) (model, vars);
578 vals = xnmalloc (n_vals, sizeof (*vals));
579 output = case_data_rw (c, model->pred);
580 assert (output != NULL);
582 for (i = 0; i < n_vals; i++)
584 vals[i] = case_data (c, vars[i]);
586 output->f = (*model->predict) ((const struct variable **) vars,
587 vals, model, n_vals);
590 return TRNS_CONTINUE;
597 regression_trns_resid_proc (void *t_, struct ccase *c,
598 casenumber case_idx UNUSED)
602 struct reg_trns *trns = t_;
603 pspp_linreg_cache *model;
604 union value *output = NULL;
605 const union value **vals = NULL;
606 const union value *obs = NULL;
607 const struct variable **vars = NULL;
609 assert (trns != NULL);
611 assert (model != NULL);
612 assert (model->depvar != NULL);
613 assert (model->resid != NULL);
615 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
616 n_vals = (*model->get_vars) (model, vars);
618 vals = xnmalloc (n_vals, sizeof (*vals));
619 output = case_data_rw (c, model->resid);
620 assert (output != NULL);
622 for (i = 0; i < n_vals; i++)
624 vals[i] = case_data (c, vars[i]);
626 obs = case_data (c, model->depvar);
627 output->f = (*model->residual) ((const struct variable **) vars,
628 vals, obs, model, n_vals);
631 return TRNS_CONTINUE;
635 Returns false if NAME is a duplicate of any existing variable name.
638 try_name (const struct dictionary *dict, const char *name)
640 if (dict_lookup_var (dict, name) != NULL)
647 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
648 const char prefix[LONG_NAME_LEN])
652 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
653 while (!try_name (dict, name))
656 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
661 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
662 pspp_linreg_cache * c, struct variable **v, int n_trns)
664 struct dictionary *dict = dataset_dict (ds);
665 static int trns_index = 1;
666 char name[LONG_NAME_LEN];
667 struct variable *new_var;
668 struct reg_trns *t = NULL;
670 t = xmalloc (sizeof (*t));
671 t->trns_id = trns_index;
674 reg_get_name (dict, name, prefix);
675 new_var = dict_create_var (dict, name, 0);
676 assert (new_var != NULL);
678 add_transformation (ds, f, regression_trns_free, t);
683 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
685 pspp_linreg_cache **lc;
689 assert (models != NULL);
693 /* Count the number of transformations we will need. */
694 for (i = 0; i < REGRESSION_SV_count; i++)
701 n_trns *= cmd.n_dependent;
703 for (lc = models; lc < models + cmd.n_dependent; lc++)
705 assert (*lc != NULL);
706 assert ((*lc)->depvar != NULL);
707 if (cmd.a_save[REGRESSION_SV_RESID])
709 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
710 &(*lc)->resid, n_trns);
712 if (cmd.a_save[REGRESSION_SV_PRED])
714 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
715 &(*lc)->pred, n_trns);
721 for (lc = models; lc < models + cmd.n_dependent; lc++)
725 pspp_linreg_cache_free (*lc);
732 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
736 for (i = 0; i < n_vars; i++)
747 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
751 struct variable **varlist;
753 fprintf (fp, "%s", reg_export_categorical_encode_1);
755 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
756 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
758 struct pspp_coeff *coeff = c->coeff[i];
759 const struct variable *v = pspp_coeff_get_var (coeff, 0);
760 if (var_is_alpha (v))
762 if (!reg_inserted (v, varlist, n_vars))
764 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
766 varlist[n_vars] = (struct variable *) v;
771 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
772 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
774 for (i = 0; i < n_vars - 1; i++)
776 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
778 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
780 for (i = 0; i < n_vars; i++)
782 int n_categories = cat_get_n_categories (varlist[i]);
785 fprintf (fp, "%s.name = \"%s\";\n\t",
786 var_get_name (varlist[i]), var_get_name (varlist[i]));
787 fprintf (fp, "%s.n_vals = %d;\n\t",
788 var_get_name (varlist[i]), n_categories);
790 for (j = 0; j < n_categories; j++)
792 const union value *val = cat_subscript_to_value (j, varlist[i]);
793 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
794 var_get_name (varlist[i]), j,
795 var_get_value_name (varlist[i], val));
798 fprintf (fp, "%s", reg_export_categorical_encode_2);
802 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
805 struct pspp_coeff *coeff;
806 const struct variable *v;
808 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
809 for (i = 1; i < c->n_indeps; i++)
812 v = pspp_coeff_get_var (coeff, 0);
813 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
816 v = pspp_coeff_get_var (coeff, 0);
817 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
820 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
822 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
823 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
824 reg_print_depvars (fp, c);
825 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
827 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
828 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
831 reg_has_categorical (pspp_linreg_cache * c)
834 const struct variable *v;
836 for (i = 1; i < c->n_coeffs; i++)
838 v = pspp_coeff_get_var (c->coeff[i], 0);
839 if (var_is_alpha (v))
846 subcommand_export (int export, pspp_linreg_cache * c)
851 int n_quantiles = 100;
853 struct pspp_coeff *coeff;
858 assert (model_file != NULL);
859 fp = fopen (fh_get_file_name (model_file), "w");
861 fprintf (fp, "%s", reg_preamble);
862 reg_print_getvar (fp, c);
863 if (reg_has_categorical (c))
865 reg_print_categorical_encoding (fp, c);
867 fprintf (fp, "%s", reg_export_t_quantiles_1);
868 for (i = 0; i < n_quantiles - 1; i++)
870 tmp = 0.5 + 0.005 * (double) i;
871 fprintf (fp, "%.15e,\n\t\t",
872 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
874 fprintf (fp, "%.15e};\n\t",
875 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
876 fprintf (fp, "%s", reg_export_t_quantiles_2);
877 fprintf (fp, "%s", reg_mean_cmt);
878 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
879 fprintf (fp, "const char *var_names[])\n{\n\t");
880 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
881 for (i = 1; i < c->n_indeps; i++)
884 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
887 fprintf (fp, "%.15e};\n\t", coeff->estimate);
889 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
890 fprintf (fp, "int i;\n\tint j;\n\n\t");
891 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
892 fprintf (fp, "%s", reg_getvar);
893 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
895 for (i = 0; i < c->cov->size1 - 1; i++)
898 for (j = 0; j < c->cov->size2 - 1; j++)
900 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
902 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
905 for (j = 0; j < c->cov->size2 - 1; j++)
907 fprintf (fp, "%.15e, ",
908 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
910 fprintf (fp, "%.15e}\n\t",
911 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
912 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
914 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
915 fprintf (fp, "%s", reg_variance);
916 fprintf (fp, "%s", reg_export_confidence_interval);
917 tmp = c->mse * c->mse;
918 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
919 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
920 fprintf (fp, "%s", reg_export_prediction_interval_3);
922 fp = fopen ("pspp_model_reg.h", "w");
923 fprintf (fp, "%s", reg_header);
929 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
930 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
932 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
933 if (!lex_force_match (lexer, '('))
936 if (lex_match (lexer, '*'))
940 model_file = fh_parse (lexer, FH_REF_FILE);
941 if (model_file == NULL)
945 if (!lex_force_match (lexer, ')'))
952 cmd_regression (struct lexer *lexer, struct dataset *ds)
956 if (!parse_regression (lexer, ds, &cmd, NULL))
959 models = xnmalloc (cmd.n_dependent, sizeof *models);
960 for (i = 0; i < cmd.n_dependent; i++)
964 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
965 return CMD_CASCADING_FAILURE;
966 subcommand_save (ds, cmd.sbc_save, models);
973 Is variable k the dependent variable?
976 is_depvar (size_t k, const struct variable *v)
978 return v == v_variables[k];
982 Mark missing cases. Return the number of non-missing cases.
983 Compute the first two moments.
986 mark_missing_cases (const struct casefile *cf, const struct variable *v,
987 int *is_missing_case, double n_data,
988 struct moments_var *mom)
990 struct casereader *r;
993 const union value *val;
996 for (r = casefile_get_reader (cf, NULL);
997 casereader_read (r, &c); case_destroy (&c))
999 row = casereader_cnum (r) - 1;
1001 val = case_data (&c, v);
1004 moments1_add (mom->m, val->f, w);
1006 cat_value_update (v, val);
1007 if (var_is_value_missing (v, val, MV_ANY))
1009 if (!is_missing_case[row])
1011 /* Now it is missing. */
1013 is_missing_case[row] = 1;
1017 casereader_destroy (r);
1022 /* Parser for the variables sub command */
1024 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
1025 struct cmd_regression *cmd UNUSED,
1028 const struct dictionary *dict = dataset_dict (ds);
1030 lex_match (lexer, '=');
1032 if ((lex_token (lexer) != T_ID
1033 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1034 && lex_token (lexer) != T_ALL)
1038 if (!parse_variables_const
1039 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1044 assert (n_variables);
1050 Count the explanatory variables. The user may or may
1051 not have specified a response variable in the syntax.
1054 get_n_indep (const struct variable *v)
1059 result = n_variables;
1060 while (i < n_variables)
1062 if (is_depvar (i, v))
1069 return (result == 0) ? 1 : result;
1073 Read from the active file. Identify the explanatory variables in
1074 v_variables. Encode categorical variables. Drop cases with missing
1078 prepare_data (int n_data, int is_missing_case[],
1079 const struct variable **indep_vars,
1080 const struct variable *depvar, const struct casefile *cf,
1081 struct moments_var *mom)
1086 assert (indep_vars != NULL);
1088 for (i = 0; i < n_variables; i++)
1091 The second condition ensures the program will run even if
1092 there is only one variable to act as both explanatory and
1095 if ((!is_depvar (i, depvar)) || (n_variables == 1))
1097 indep_vars[j] = v_variables[i];
1099 if (var_is_alpha (v_variables[i]))
1101 /* Make a place to hold the binary vectors
1102 corresponding to this variable's values. */
1103 cat_stored_values_create (v_variables[i]);
1106 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data,
1111 Mark missing cases for the dependent variable.
1113 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data, NULL);
1118 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1120 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1121 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1122 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1123 pspp_coeff_init (c->coeff + 1, dm);
1127 Put the moments in the linreg cache.
1130 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1131 struct design_matrix *dm, size_t n)
1141 Scan the variable names in the columns of the design matrix.
1142 When we find the variable we need, insert its mean in the cache.
1144 for (i = 0; i < dm->m->size2; i++)
1146 for (j = 0; j < n; j++)
1148 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1150 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1151 &skewness, &kurtosis);
1152 gsl_vector_set (c->indep_means, i, mean);
1153 gsl_vector_set (c->indep_std, i, sqrt (variance));
1159 run_regression (const struct ccase *first,
1160 const struct casefile *cf, void *cmd_ UNUSED,
1161 const struct dataset *ds)
1164 size_t n_data = 0; /* Number of valide cases. */
1165 size_t n_cases; /* Number of cases. */
1171 Keep track of the missing cases.
1173 int *is_missing_case;
1174 const union value *val;
1175 struct casereader *r;
1177 const struct variable **indep_vars;
1178 struct design_matrix *X;
1179 struct moments_var *mom;
1182 pspp_linreg_opts lopts;
1184 assert (models != NULL);
1186 output_split_file_values (ds, first);
1190 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1194 n_cases = casefile_get_case_cnt (cf);
1196 for (i = 0; i < cmd.n_dependent; i++)
1198 if (!var_is_numeric (cmd.v_dependent[i]))
1200 msg (SE, gettext ("Dependent variable must be numeric."));
1201 pspp_reg_rc = CMD_FAILURE;
1206 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1207 mom = xnmalloc (n_variables, sizeof (*mom));
1208 for (i = 0; i < n_variables; i++)
1210 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1211 (mom + i)->v = v_variables[i];
1213 lopts.get_depvar_mean_std = 1;
1215 for (k = 0; k < cmd.n_dependent; k++)
1217 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1218 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1219 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1220 assert (indep_vars != NULL);
1222 for (i = 0; i < n_cases; i++)
1224 is_missing_case[i] = 0;
1226 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1228 (const struct casefile *) cf, mom);
1229 if ((n_data > 0) && (n_indep > 0))
1231 Y = gsl_vector_alloc (n_data);
1233 design_matrix_create (n_indep,
1234 (const struct variable **) indep_vars,
1236 for (i = 0; i < X->m->size2; i++)
1238 lopts.get_indep_mean_std[i] = 1;
1240 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1241 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1242 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1243 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1245 For large data sets, use QR decomposition.
1247 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1249 models[k]->method = PSPP_LINREG_SVD;
1253 The second pass fills the design matrix.
1256 for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c);
1258 /* Iterate over the cases. */
1260 case_num = casereader_cnum (r) - 1;
1261 if (!is_missing_case[case_num])
1263 for (i = 0; i < n_variables; ++i) /* Iterate over the
1268 val = case_data (&c, v_variables[i]);
1270 Independent/dependent variable separation. The
1271 'variables' subcommand specifies a varlist which contains
1272 both dependent and independent variables. The dependent
1273 variables are specified with the 'dependent'
1274 subcommand, and maybe also in the 'variables' subcommand.
1275 We need to separate the two.
1277 if (!is_depvar (i, cmd.v_dependent[k]))
1279 if (var_is_alpha (v_variables[i]))
1281 design_matrix_set_categorical (X, row,
1287 design_matrix_set_numeric (X, row,
1288 v_variables[i], val);
1292 val = case_data (&c, cmd.v_dependent[k]);
1293 gsl_vector_set (Y, row, val->f);
1298 Now that we know the number of coefficients, allocate space
1299 and store pointers to the variables that correspond to the
1302 coeff_init (models[k], X);
1305 Find the least-squares estimates and other statistics.
1307 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1308 compute_moments (models[k], mom, X, n_variables);
1309 subcommand_statistics (cmd.a_statistics, models[k]);
1310 subcommand_export (cmd.sbc_export, models[k]);
1312 gsl_vector_free (Y);
1313 design_matrix_destroy (X);
1315 free (lopts.get_indep_mean_std);
1316 casereader_destroy (r);
1320 msg (SE, gettext ("No valid data found. This command was skipped."));
1323 for (i = 0; i < n_variables; i++)
1325 moments1_destroy ((mom + i)->m);
1328 free (is_missing_case);