1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include <procedure.h>
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
81 /* Linear regression models. */
82 pspp_linreg_cache **models = NULL;
85 Transformations for saving predicted values
90 int n_trns; /* Number of transformations. */
91 int trns_id; /* Which trns is this one? */
92 pspp_linreg_cache *c; /* Linear model for this trns. */
95 Variables used (both explanatory and response).
97 static struct variable **v_variables;
102 static size_t n_variables;
105 File where the model will be saved if the EXPORT subcommand
108 struct file_handle *model_file;
111 Return value for the procedure.
113 int pspp_reg_rc = CMD_SUCCESS;
115 static bool run_regression (const struct casefile *, void *);
118 STATISTICS subcommand output functions.
120 static void reg_stats_r (pspp_linreg_cache *);
121 static void reg_stats_coeff (pspp_linreg_cache *);
122 static void reg_stats_anova (pspp_linreg_cache *);
123 static void reg_stats_outs (pspp_linreg_cache *);
124 static void reg_stats_zpp (pspp_linreg_cache *);
125 static void reg_stats_label (pspp_linreg_cache *);
126 static void reg_stats_sha (pspp_linreg_cache *);
127 static void reg_stats_ci (pspp_linreg_cache *);
128 static void reg_stats_f (pspp_linreg_cache *);
129 static void reg_stats_bcov (pspp_linreg_cache *);
130 static void reg_stats_ses (pspp_linreg_cache *);
131 static void reg_stats_xtx (pspp_linreg_cache *);
132 static void reg_stats_collin (pspp_linreg_cache *);
133 static void reg_stats_tol (pspp_linreg_cache *);
134 static void reg_stats_selection (pspp_linreg_cache *);
135 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
136 int, pspp_linreg_cache *);
139 reg_stats_r (pspp_linreg_cache * c)
149 rsq = c->ssm / c->sst;
150 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
151 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
152 t = tab_create (n_cols, n_rows, 0);
153 tab_dim (t, tab_natural_dimensions);
154 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
155 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
156 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
157 tab_vline (t, TAL_0, 1, 0, 0);
159 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
160 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
161 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
162 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
163 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
164 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
165 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
166 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
167 tab_title (t, _("Model Summary"));
172 Table showing estimated regression coefficients.
175 reg_stats_coeff (pspp_linreg_cache * c)
187 const struct variable *v;
188 const union value *val;
193 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
194 n_rows = c->n_coeffs + 2;
196 t = tab_create (n_cols, n_rows, 0);
197 tab_headers (t, 2, 0, 1, 0);
198 tab_dim (t, tab_natural_dimensions);
199 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
200 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
201 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
202 tab_vline (t, TAL_0, 1, 0, 0);
204 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
205 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
206 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
207 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
208 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
209 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
210 coeff = c->coeff[0].estimate;
211 tab_float (t, 2, 1, 0, coeff, 10, 2);
212 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
213 tab_float (t, 3, 1, 0, std_err, 10, 2);
214 beta = coeff / c->depvar_std;
215 tab_float (t, 4, 1, 0, beta, 10, 2);
216 t_stat = coeff / std_err;
217 tab_float (t, 5, 1, 0, t_stat, 10, 2);
218 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
219 tab_float (t, 6, 1, 0, pval, 10, 2);
220 for (j = 1; j <= c->n_indeps; j++)
222 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
223 label = var_to_string (v);
224 /* Do not overwrite the variable's name. */
225 strncpy (tmp, label, MAX_STRING);
226 if (v->type == ALPHA)
229 Append the value associated with this coefficient.
230 This makes sense only if we us the usual binary encoding
234 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
235 val_s = value_to_string (val, v);
236 strncat (tmp, val_s, MAX_STRING);
239 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
241 Regression coefficients.
243 coeff = c->coeff[j].estimate;
244 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
246 Standard error of the coefficients.
248 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
249 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
251 'Standardized' coefficient, i.e., regression coefficient
252 if all variables had unit variance.
254 beta = gsl_vector_get (c->indep_std, j);
255 beta *= coeff / c->depvar_std;
256 tab_float (t, 4, j + 1, 0, beta, 10, 2);
259 Test statistic for H0: coefficient is 0.
261 t_stat = coeff / std_err;
262 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
264 P values for the test statistic above.
266 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
267 tab_float (t, 6, j + 1, 0, pval, 10, 2);
269 tab_title (t, _("Coefficients"));
275 Display the ANOVA table.
278 reg_stats_anova (pspp_linreg_cache * c)
282 const double msm = c->ssm / c->dfm;
283 const double mse = c->sse / c->dfe;
284 const double F = msm / mse;
285 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
290 t = tab_create (n_cols, n_rows, 0);
291 tab_headers (t, 2, 0, 1, 0);
292 tab_dim (t, tab_natural_dimensions);
294 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
296 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
297 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
298 tab_vline (t, TAL_0, 1, 0, 0);
300 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
301 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
302 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
303 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
304 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
306 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
307 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
308 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
310 /* Sums of Squares */
311 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
312 tab_float (t, 2, 3, 0, c->sst, 10, 2);
313 tab_float (t, 2, 2, 0, c->sse, 10, 2);
316 /* Degrees of freedom */
317 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
318 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
319 tab_float (t, 3, 3, 0, c->dft, 4, 0);
323 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
324 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
326 tab_float (t, 5, 1, 0, F, 8, 3);
328 tab_float (t, 6, 1, 0, pval, 8, 3);
330 tab_title (t, _("ANOVA"));
334 reg_stats_outs (pspp_linreg_cache * c)
339 reg_stats_zpp (pspp_linreg_cache * c)
344 reg_stats_label (pspp_linreg_cache * c)
349 reg_stats_sha (pspp_linreg_cache * c)
354 reg_stats_ci (pspp_linreg_cache * c)
359 reg_stats_f (pspp_linreg_cache * c)
364 reg_stats_bcov (pspp_linreg_cache * c)
376 n_cols = c->n_indeps + 1 + 2;
377 n_rows = 2 * (c->n_indeps + 1);
378 t = tab_create (n_cols, n_rows, 0);
379 tab_headers (t, 2, 0, 1, 0);
380 tab_dim (t, tab_natural_dimensions);
381 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
382 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
383 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
384 tab_vline (t, TAL_0, 1, 0, 0);
385 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
386 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
387 for (i = 1; i < c->n_coeffs; i++)
389 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
390 label = var_to_string (v);
391 tab_text (t, 2, i, TAB_CENTER, label);
392 tab_text (t, i + 2, 0, TAB_CENTER, label);
393 for (k = 1; k < c->n_coeffs; k++)
395 col = (i <= k) ? k : i;
396 row = (i <= k) ? i : k;
397 tab_float (t, k + 2, i, TAB_CENTER,
398 gsl_matrix_get (c->cov, row, col), 8, 3);
401 tab_title (t, _("Coefficient Correlations"));
405 reg_stats_ses (pspp_linreg_cache * c)
410 reg_stats_xtx (pspp_linreg_cache * c)
415 reg_stats_collin (pspp_linreg_cache * c)
420 reg_stats_tol (pspp_linreg_cache * c)
425 reg_stats_selection (pspp_linreg_cache * c)
431 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
432 int keyword, pspp_linreg_cache * c)
441 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
444 The order here must match the order in which the STATISTICS
445 keywords appear in the specification section above.
472 Set everything but F.
474 for (i = 0; i < f; i++)
481 for (i = 0; i < all; i++)
489 Default output: ANOVA table, parameter estimates,
490 and statistics for variables not entered into model,
493 if (keywords[defaults] | d)
501 statistics_keyword_output (reg_stats_r, keywords[r], c);
502 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
503 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
504 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
505 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
506 statistics_keyword_output (reg_stats_label, keywords[label], c);
507 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
508 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
509 statistics_keyword_output (reg_stats_f, keywords[f], c);
510 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
511 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
512 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
513 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
514 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
515 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
518 Free the transformation. Free its linear model if this
519 transformation is the last one.
522 bool regression_trns_free (void *t_)
525 struct reg_trns *t = t_;
527 if (t->trns_id == t->n_trns)
529 result = pspp_linreg_cache_free (t->c);
536 Gets the predicted values.
539 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
544 union value *tmp = NULL;
545 struct reg_trns *t = t_;
546 pspp_linreg_cache *model;
548 const union value **vals = NULL;
549 struct variable **vars = NULL;
550 struct variable **model_vars = NULL;
554 assert (model != NULL);
555 assert (model->depvar != NULL);
556 assert (model->pred != NULL);
558 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_SYSTEM);
559 vals = xnmalloc (n_vars, sizeof (*vals));
560 model_vars = xnmalloc (n_vars, sizeof (*model_vars));
561 assert (vals != NULL);
562 output = case_data_rw (c, model->pred->fv);
563 assert (output != NULL);
565 for (i = 0; i < n_vars; i++)
567 /* Use neither the predicted values nor the dependent variable. */
568 if (vars[i]->index != model->pred->index &&
569 vars[i]->index != model->depvar->index)
571 if (vars[i]->type == ALPHA && vars[i]->obs_vals != NULL)
573 tmp = vars[i]->obs_vals->vals;
580 Make sure the variable we use is in the linear model.
582 if (pspp_linreg_get_coeff (model, vars[i], tmp) != NULL)
584 vals[n_vals] = case_data (c, i);
585 model_vars[n_vals] = vars[i];
590 output->f = (*model->predict) ((const struct variable **) vars,
591 vals, model, n_vals);
594 return TRNS_CONTINUE;
600 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
604 struct reg_trns *trns = t_;
605 pspp_linreg_cache *model;
606 union value *output = NULL;
607 const union value **vals = NULL;
608 const union value *obs = NULL;
609 struct variable **vars = NULL;
611 assert (trns!= NULL);
613 assert (model != NULL);
614 assert (model->depvar != NULL);
615 assert (model->resid != NULL);
617 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
618 n_vals = (*model->get_vars) (model, vars);
620 vals = xnmalloc (n_vals, sizeof (*vals));
621 output = case_data_rw (c, model->resid->fv);
622 assert (output != NULL);
624 for (i = 0; i < n_vals; i++)
626 vals[i] = case_data (c, vars[i]->fv);
628 obs = case_data (c, model->depvar->fv);
629 output->f = (*model->residual) ((const struct variable **) vars,
630 vals, obs, model, n_vals);
633 return TRNS_CONTINUE;
636 Returns 0 if NAME is a duplicate of any existing variable name.
639 try_name (char *name)
641 if (dict_lookup_var (default_dict, name) != NULL)
647 void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
651 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
652 while (!try_name(name))
655 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
659 reg_save_var (const char *prefix, trns_proc_func *f,
660 pspp_linreg_cache *c, struct variable **v,
663 static int trns_index = 1;
664 char name[LONG_NAME_LEN];
665 struct variable *new_var;
666 struct reg_trns *t = NULL;
668 t = xmalloc (sizeof (*t));
669 t->trns_id = trns_index;
672 reg_get_name (name, prefix);
673 new_var = dict_create_var (default_dict, name, 0);
674 assert (new_var != NULL);
676 add_transformation (f, regression_trns_free, t);
680 subcommand_save (int save, pspp_linreg_cache **models)
682 pspp_linreg_cache **lc;
686 assert (models != NULL);
690 /* Count the number of transformations we will need. */
691 for (i = 0; i < REGRESSION_SV_count; i++)
698 n_trns *= cmd.n_dependent;
700 for (lc = models; lc < models + cmd.n_dependent; lc++)
702 assert (*lc != NULL);
703 assert ((*lc)->depvar != NULL);
704 if (cmd.a_save[REGRESSION_SV_RESID])
706 reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns);
708 if (cmd.a_save[REGRESSION_SV_PRED])
710 reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns);
716 for (lc = models; lc < models + cmd.n_dependent; lc++)
718 assert (*lc != NULL);
719 pspp_linreg_cache_free (*lc);
724 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
728 for (i = 0; i < n_vars; i++)
730 if (v->index == varlist[i]->index)
738 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
743 struct variable **varlist;
744 struct pspp_linreg_coeff *coeff;
745 const struct variable *v;
748 fprintf (fp, "%s", reg_export_categorical_encode_1);
750 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
751 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
753 coeff = c->coeff + i;
754 v = pspp_linreg_coeff_get_var (coeff, 0);
755 if (v->type == ALPHA)
757 if (!reg_inserted (v, varlist, n_vars))
759 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
761 varlist[n_vars] = (struct variable *) v;
766 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
767 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
769 for (i = 0; i < n_vars - 1; i++)
771 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
773 fprintf (fp, "&%s};\n\t", varlist[i]->name);
775 for (i = 0; i < n_vars; i++)
777 coeff = c->coeff + i;
778 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
780 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
781 varlist[i]->obs_vals->n_categories);
783 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
785 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
786 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
787 value_to_string (val, varlist[i]));
790 fprintf (fp, "%s", reg_export_categorical_encode_2);
794 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
797 struct pspp_linreg_coeff *coeff;
798 const struct variable *v;
800 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
801 for (i = 1; i < c->n_indeps; i++)
803 coeff = c->coeff + i;
804 v = pspp_linreg_coeff_get_var (coeff, 0);
805 fprintf (fp, "\"%s\",\n\t\t", v->name);
807 coeff = c->coeff + i;
808 v = pspp_linreg_coeff_get_var (coeff, 0);
809 fprintf (fp, "\"%s\"};\n\t", v->name);
812 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
814 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
815 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
816 reg_print_depvars (fp, c);
817 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
819 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
820 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
823 reg_has_categorical (pspp_linreg_cache * c)
826 const struct variable *v;
828 for (i = 1; i < c->n_coeffs; i++)
830 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
831 if (v->type == ALPHA)
840 subcommand_export (int export, pspp_linreg_cache * c)
845 int n_quantiles = 100;
848 struct pspp_linreg_coeff coeff;
853 assert (model_file != NULL);
854 fp = fopen (fh_get_file_name (model_file), "w");
856 fprintf (fp, "%s", reg_preamble);
857 reg_print_getvar (fp, c);
858 if (reg_has_categorical (c))
860 reg_print_categorical_encoding (fp, c);
862 fprintf (fp, "%s", reg_export_t_quantiles_1);
863 increment = 0.5 / (double) increment;
864 for (i = 0; i < n_quantiles - 1; i++)
866 tmp = 0.5 + 0.005 * (double) i;
867 fprintf (fp, "%.15e,\n\t\t",
868 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
870 fprintf (fp, "%.15e};\n\t",
871 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
872 fprintf (fp, "%s", reg_export_t_quantiles_2);
873 fprintf (fp, "%s", reg_mean_cmt);
874 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
875 fprintf (fp, "const char *var_names[])\n{\n\t");
876 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
877 for (i = 1; i < c->n_indeps; i++)
880 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
883 fprintf (fp, "%.15e};\n\t", coeff.estimate);
885 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
886 fprintf (fp, "int i;\n\tint j;\n\n\t");
887 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
888 fprintf (fp, "%s", reg_getvar);
889 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
891 for (i = 0; i < c->cov->size1 - 1; i++)
894 for (j = 0; j < c->cov->size2 - 1; j++)
896 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
898 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
901 for (j = 0; j < c->cov->size2 - 1; j++)
903 fprintf (fp, "%.15e, ",
904 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
906 fprintf (fp, "%.15e}\n\t",
907 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
908 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
910 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
911 fprintf (fp, "%s", reg_variance);
912 fprintf (fp, "%s", reg_export_confidence_interval);
913 tmp = c->mse * c->mse;
914 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
915 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
916 fprintf (fp, "%s", reg_export_prediction_interval_3);
918 fp = fopen ("pspp_model_reg.h", "w");
919 fprintf (fp, "%s", reg_header);
924 regression_custom_export (struct cmd_regression *cmd UNUSED)
926 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
927 if (!lex_force_match ('('))
934 model_file = fh_parse (FH_REF_FILE);
935 if (model_file == NULL)
939 if (!lex_force_match (')'))
946 cmd_regression (void)
948 if (!parse_regression (&cmd))
951 models = xnmalloc (cmd.n_dependent, sizeof *models);
952 if (!multipass_procedure_with_splits (run_regression, &cmd))
953 return CMD_CASCADING_FAILURE;
954 subcommand_save (cmd.sbc_save, models);
961 Is variable k the dependent variable?
964 is_depvar (size_t k, const struct variable *v)
967 compare_var_names returns 0 if the variable
970 if (!compare_var_names (v, v_variables[k], NULL))
977 Mark missing cases. Return the number of non-missing cases.
980 mark_missing_cases (const struct casefile *cf, struct variable *v,
981 int *is_missing_case, double n_data)
983 struct casereader *r;
986 const union value *val;
988 for (r = casefile_get_reader (cf);
989 casereader_read (r, &c); case_destroy (&c))
991 row = casereader_cnum (r) - 1;
993 val = case_data (&c, v->fv);
994 cat_value_update (v, val);
995 if (mv_is_value_missing (&v->miss, val))
997 if (!is_missing_case[row])
999 /* Now it is missing. */
1001 is_missing_case[row] = 1;
1005 casereader_destroy (r);
1010 /* Parser for the variables sub command */
1012 regression_custom_variables(struct cmd_regression *cmd UNUSED)
1017 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
1022 if (!parse_variables (default_dict, &v_variables, &n_variables,
1028 assert(n_variables);
1033 Count the explanatory variables. The user may or may
1034 not have specified a response variable in the syntax.
1037 int get_n_indep (const struct variable *v)
1042 result = n_variables;
1043 while (i < n_variables)
1045 if (is_depvar (i, v))
1055 Read from the active file. Identify the explanatory variables in
1056 v_variables. Encode categorical variables. Drop cases with missing
1060 int prepare_data (int n_data, int is_missing_case[],
1061 struct variable **indep_vars,
1062 struct variable *depvar,
1063 const struct casefile *cf)
1068 assert (indep_vars != NULL);
1070 for (i = 0; i < n_variables; i++)
1072 if (!is_depvar (i, depvar))
1074 indep_vars[j] = v_variables[i];
1076 if (v_variables[i]->type == ALPHA)
1078 /* Make a place to hold the binary vectors
1079 corresponding to this variable's values. */
1080 cat_stored_values_create (v_variables[i]);
1082 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1086 Mark missing cases for the dependent variable.
1088 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1093 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
1096 size_t n_data = 0; /* Number of valide cases. */
1097 size_t n_cases; /* Number of cases. */
1103 Keep track of the missing cases.
1105 int *is_missing_case;
1106 const union value *val;
1107 struct casereader *r;
1109 struct variable **indep_vars;
1110 struct design_matrix *X;
1113 pspp_linreg_opts lopts;
1115 assert (models != NULL);
1118 dict_get_vars (default_dict, &v_variables, &n_variables,
1122 n_cases = casefile_get_case_cnt (cf);
1124 for (i = 0; i < cmd.n_dependent; i++)
1126 if (cmd.v_dependent[i]->type != NUMERIC)
1128 msg (SE, gettext ("Dependent variable must be numeric."));
1129 pspp_reg_rc = CMD_FAILURE;
1134 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1136 lopts.get_depvar_mean_std = 1;
1138 for (k = 0; k < cmd.n_dependent; k++)
1140 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1141 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1142 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1143 assert (indep_vars != NULL);
1145 for (i = 0; i < n_cases; i++)
1147 is_missing_case[i] = 0;
1149 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1151 (const struct casefile *) cf);
1152 Y = gsl_vector_alloc (n_data);
1155 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1157 for (i = 0; i < X->m->size2; i++)
1159 lopts.get_indep_mean_std[i] = 1;
1161 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1162 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1163 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1164 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1166 For large data sets, use QR decomposition.
1168 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1170 models[k]->method = PSPP_LINREG_SVD;
1174 The second pass fills the design matrix.
1177 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1179 /* Iterate over the cases. */
1181 case_num = casereader_cnum (r) - 1;
1182 if (!is_missing_case[case_num])
1184 for (i = 0; i < n_variables; ++i) /* Iterate over the
1189 val = case_data (&c, v_variables[i]->fv);
1191 Independent/dependent variable separation. The
1192 'variables' subcommand specifies a varlist which contains
1193 both dependent and independent variables. The dependent
1194 variables are specified with the 'dependent'
1195 subcommand, and maybe also in the 'variables' subcommand.
1196 We need to separate the two.
1198 if (!is_depvar (i, cmd.v_dependent[k]))
1200 if (v_variables[i]->type == ALPHA)
1202 design_matrix_set_categorical (X, row, v_variables[i], val);
1204 else if (v_variables[i]->type == NUMERIC)
1206 design_matrix_set_numeric (X, row, v_variables[i], val);
1210 val = case_data (&c, cmd.v_dependent[k]->fv);
1211 gsl_vector_set (Y, row, val->f);
1216 Now that we know the number of coefficients, allocate space
1217 and store pointers to the variables that correspond to the
1220 pspp_linreg_coeff_init (models[k], X);
1223 Find the least-squares estimates and other statistics.
1225 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1226 subcommand_statistics (cmd.a_statistics, models[k]);
1227 subcommand_export (cmd.sbc_export, models[k]);
1229 gsl_vector_free (Y);
1230 design_matrix_destroy (X);
1232 free (lopts.get_indep_mean_std);
1233 casereader_destroy (r);
1236 free (is_missing_case);