1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include <procedure.h>
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
81 /* Linear regression models. */
82 pspp_linreg_cache **models = NULL;
85 Transformations for saving predicted values
90 int n_trns; /* Number of transformations. */
91 int trns_id; /* Which trns is this one? */
92 pspp_linreg_cache *c; /* Linear model for this trns. */
95 Variables used (both explanatory and response).
97 static struct variable **v_variables;
102 static size_t n_variables;
105 File where the model will be saved if the EXPORT subcommand
108 struct file_handle *model_file;
111 Return value for the procedure.
113 int pspp_reg_rc = CMD_SUCCESS;
115 static bool run_regression (const struct casefile *, void *);
118 STATISTICS subcommand output functions.
120 static void reg_stats_r (pspp_linreg_cache *);
121 static void reg_stats_coeff (pspp_linreg_cache *);
122 static void reg_stats_anova (pspp_linreg_cache *);
123 static void reg_stats_outs (pspp_linreg_cache *);
124 static void reg_stats_zpp (pspp_linreg_cache *);
125 static void reg_stats_label (pspp_linreg_cache *);
126 static void reg_stats_sha (pspp_linreg_cache *);
127 static void reg_stats_ci (pspp_linreg_cache *);
128 static void reg_stats_f (pspp_linreg_cache *);
129 static void reg_stats_bcov (pspp_linreg_cache *);
130 static void reg_stats_ses (pspp_linreg_cache *);
131 static void reg_stats_xtx (pspp_linreg_cache *);
132 static void reg_stats_collin (pspp_linreg_cache *);
133 static void reg_stats_tol (pspp_linreg_cache *);
134 static void reg_stats_selection (pspp_linreg_cache *);
135 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
136 int, pspp_linreg_cache *);
139 reg_stats_r (pspp_linreg_cache * c)
149 rsq = c->ssm / c->sst;
150 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
151 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
152 t = tab_create (n_cols, n_rows, 0);
153 tab_dim (t, tab_natural_dimensions);
154 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
155 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
156 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
157 tab_vline (t, TAL_0, 1, 0, 0);
159 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
160 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
161 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
162 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
163 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
164 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
165 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
166 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
167 tab_title (t, _("Model Summary"));
172 Table showing estimated regression coefficients.
175 reg_stats_coeff (pspp_linreg_cache * c)
187 const struct variable *v;
188 const union value *val;
193 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
194 n_rows = c->n_coeffs + 2;
196 t = tab_create (n_cols, n_rows, 0);
197 tab_headers (t, 2, 0, 1, 0);
198 tab_dim (t, tab_natural_dimensions);
199 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
200 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
201 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
202 tab_vline (t, TAL_0, 1, 0, 0);
204 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
205 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
206 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
207 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
208 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
209 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
210 coeff = c->coeff[0].estimate;
211 tab_float (t, 2, 1, 0, coeff, 10, 2);
212 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
213 tab_float (t, 3, 1, 0, std_err, 10, 2);
214 beta = coeff / c->depvar_std;
215 tab_float (t, 4, 1, 0, beta, 10, 2);
216 t_stat = coeff / std_err;
217 tab_float (t, 5, 1, 0, t_stat, 10, 2);
218 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
219 tab_float (t, 6, 1, 0, pval, 10, 2);
220 for (j = 1; j <= c->n_indeps; j++)
222 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
223 label = var_to_string (v);
224 /* Do not overwrite the variable's name. */
225 strncpy (tmp, label, MAX_STRING);
226 if (v->type == ALPHA)
229 Append the value associated with this coefficient.
230 This makes sense only if we us the usual binary encoding
234 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
235 val_s = value_to_string (val, v);
236 strncat (tmp, val_s, MAX_STRING);
239 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
241 Regression coefficients.
243 coeff = c->coeff[j].estimate;
244 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
246 Standard error of the coefficients.
248 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
249 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
251 'Standardized' coefficient, i.e., regression coefficient
252 if all variables had unit variance.
254 beta = gsl_vector_get (c->indep_std, j);
255 beta *= coeff / c->depvar_std;
256 tab_float (t, 4, j + 1, 0, beta, 10, 2);
259 Test statistic for H0: coefficient is 0.
261 t_stat = coeff / std_err;
262 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
264 P values for the test statistic above.
266 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
267 tab_float (t, 6, j + 1, 0, pval, 10, 2);
269 tab_title (t, _("Coefficients"));
275 Display the ANOVA table.
278 reg_stats_anova (pspp_linreg_cache * c)
282 const double msm = c->ssm / c->dfm;
283 const double mse = c->sse / c->dfe;
284 const double F = msm / mse;
285 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
290 t = tab_create (n_cols, n_rows, 0);
291 tab_headers (t, 2, 0, 1, 0);
292 tab_dim (t, tab_natural_dimensions);
294 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
296 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
297 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
298 tab_vline (t, TAL_0, 1, 0, 0);
300 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
301 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
302 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
303 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
304 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
306 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
307 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
308 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
310 /* Sums of Squares */
311 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
312 tab_float (t, 2, 3, 0, c->sst, 10, 2);
313 tab_float (t, 2, 2, 0, c->sse, 10, 2);
316 /* Degrees of freedom */
317 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
318 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
319 tab_float (t, 3, 3, 0, c->dft, 4, 0);
323 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
324 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
326 tab_float (t, 5, 1, 0, F, 8, 3);
328 tab_float (t, 6, 1, 0, pval, 8, 3);
330 tab_title (t, _("ANOVA"));
334 reg_stats_outs (pspp_linreg_cache * c)
339 reg_stats_zpp (pspp_linreg_cache * c)
344 reg_stats_label (pspp_linreg_cache * c)
349 reg_stats_sha (pspp_linreg_cache * c)
354 reg_stats_ci (pspp_linreg_cache * c)
359 reg_stats_f (pspp_linreg_cache * c)
364 reg_stats_bcov (pspp_linreg_cache * c)
376 n_cols = c->n_indeps + 1 + 2;
377 n_rows = 2 * (c->n_indeps + 1);
378 t = tab_create (n_cols, n_rows, 0);
379 tab_headers (t, 2, 0, 1, 0);
380 tab_dim (t, tab_natural_dimensions);
381 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
382 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
383 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
384 tab_vline (t, TAL_0, 1, 0, 0);
385 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
386 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
387 for (i = 1; i < c->n_coeffs; i++)
389 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
390 label = var_to_string (v);
391 tab_text (t, 2, i, TAB_CENTER, label);
392 tab_text (t, i + 2, 0, TAB_CENTER, label);
393 for (k = 1; k < c->n_coeffs; k++)
395 col = (i <= k) ? k : i;
396 row = (i <= k) ? i : k;
397 tab_float (t, k + 2, i, TAB_CENTER,
398 gsl_matrix_get (c->cov, row, col), 8, 3);
401 tab_title (t, _("Coefficient Correlations"));
405 reg_stats_ses (pspp_linreg_cache * c)
410 reg_stats_xtx (pspp_linreg_cache * c)
415 reg_stats_collin (pspp_linreg_cache * c)
420 reg_stats_tol (pspp_linreg_cache * c)
425 reg_stats_selection (pspp_linreg_cache * c)
431 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
432 int keyword, pspp_linreg_cache * c)
441 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
444 The order here must match the order in which the STATISTICS
445 keywords appear in the specification section above.
472 Set everything but F.
474 for (i = 0; i < f; i++)
481 for (i = 0; i < all; i++)
489 Default output: ANOVA table, parameter estimates,
490 and statistics for variables not entered into model,
493 if (keywords[defaults] | d)
501 statistics_keyword_output (reg_stats_r, keywords[r], c);
502 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
503 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
504 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
505 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
506 statistics_keyword_output (reg_stats_label, keywords[label], c);
507 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
508 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
509 statistics_keyword_output (reg_stats_f, keywords[f], c);
510 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
511 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
512 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
513 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
514 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
515 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
518 Free the transformation. Free its linear model if this
519 transformation is the last one.
522 bool regression_trns_free (void *t_)
525 struct reg_trns *t = t_;
527 if (t->trns_id == t->n_trns)
529 result = pspp_linreg_cache_free (t->c);
536 Gets the predicted values.
539 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
543 struct reg_trns *trns = t_;
544 pspp_linreg_cache *model;
545 union value *output = NULL;
546 const union value **vals = NULL;
547 struct variable **vars = NULL;
549 assert (trns!= NULL);
551 assert (model != NULL);
552 assert (model->depvar != NULL);
553 assert (model->pred != NULL);
555 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
556 n_vals = (*model->get_vars) (model, vars);
558 vals = xnmalloc (n_vals, sizeof (*vals));
559 output = case_data_rw (c, model->pred->fv);
560 assert (output != NULL);
562 for (i = 0; i < n_vals; i++)
564 vals[i] = case_data (c, vars[i]->fv);
566 output->f = (*model->predict) ((const struct variable **) vars,
567 vals, model, n_vals);
570 return TRNS_CONTINUE;
576 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
580 struct reg_trns *trns = t_;
581 pspp_linreg_cache *model;
582 union value *output = NULL;
583 const union value **vals = NULL;
584 const union value *obs = NULL;
585 struct variable **vars = NULL;
587 assert (trns!= NULL);
589 assert (model != NULL);
590 assert (model->depvar != NULL);
591 assert (model->resid != NULL);
593 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
594 n_vals = (*model->get_vars) (model, vars);
596 vals = xnmalloc (n_vals, sizeof (*vals));
597 output = case_data_rw (c, model->resid->fv);
598 assert (output != NULL);
600 for (i = 0; i < n_vals; i++)
602 vals[i] = case_data (c, vars[i]->fv);
604 obs = case_data (c, model->depvar->fv);
605 output->f = (*model->residual) ((const struct variable **) vars,
606 vals, obs, model, n_vals);
609 return TRNS_CONTINUE;
612 Returns 0 if NAME is a duplicate of any existing variable name.
615 try_name (char *name)
617 if (dict_lookup_var (default_dict, name) != NULL)
623 void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
627 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
628 while (!try_name(name))
631 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
635 reg_save_var (const char *prefix, trns_proc_func *f,
636 pspp_linreg_cache *c, struct variable **v,
639 static int trns_index = 1;
640 char name[LONG_NAME_LEN];
641 struct variable *new_var;
642 struct reg_trns *t = NULL;
644 t = xmalloc (sizeof (*t));
645 t->trns_id = trns_index;
648 reg_get_name (name, prefix);
649 new_var = dict_create_var (default_dict, name, 0);
650 assert (new_var != NULL);
652 add_transformation (f, regression_trns_free, t);
656 subcommand_save (int save, pspp_linreg_cache **models)
658 pspp_linreg_cache **lc;
662 assert (models != NULL);
666 /* Count the number of transformations we will need. */
667 for (i = 0; i < REGRESSION_SV_count; i++)
674 n_trns *= cmd.n_dependent;
676 for (lc = models; lc < models + cmd.n_dependent; lc++)
678 assert (*lc != NULL);
679 assert ((*lc)->depvar != NULL);
680 if (cmd.a_save[REGRESSION_SV_RESID])
682 reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns);
684 if (cmd.a_save[REGRESSION_SV_PRED])
686 reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns);
692 for (lc = models; lc < models + cmd.n_dependent; lc++)
694 assert (*lc != NULL);
695 pspp_linreg_cache_free (*lc);
700 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
704 for (i = 0; i < n_vars; i++)
706 if (v->index == varlist[i]->index)
714 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
719 struct variable **varlist;
720 struct pspp_linreg_coeff *coeff;
721 const struct variable *v;
724 fprintf (fp, "%s", reg_export_categorical_encode_1);
726 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
727 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
729 coeff = c->coeff + i;
730 v = pspp_linreg_coeff_get_var (coeff, 0);
731 if (v->type == ALPHA)
733 if (!reg_inserted (v, varlist, n_vars))
735 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
737 varlist[n_vars] = (struct variable *) v;
742 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
743 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
745 for (i = 0; i < n_vars - 1; i++)
747 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
749 fprintf (fp, "&%s};\n\t", varlist[i]->name);
751 for (i = 0; i < n_vars; i++)
753 coeff = c->coeff + i;
754 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
756 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
757 varlist[i]->obs_vals->n_categories);
759 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
761 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
762 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
763 value_to_string (val, varlist[i]));
766 fprintf (fp, "%s", reg_export_categorical_encode_2);
770 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
773 struct pspp_linreg_coeff *coeff;
774 const struct variable *v;
776 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
777 for (i = 1; i < c->n_indeps; i++)
779 coeff = c->coeff + i;
780 v = pspp_linreg_coeff_get_var (coeff, 0);
781 fprintf (fp, "\"%s\",\n\t\t", v->name);
783 coeff = c->coeff + i;
784 v = pspp_linreg_coeff_get_var (coeff, 0);
785 fprintf (fp, "\"%s\"};\n\t", v->name);
788 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
790 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
791 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
792 reg_print_depvars (fp, c);
793 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
795 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
796 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
799 reg_has_categorical (pspp_linreg_cache * c)
802 const struct variable *v;
804 for (i = 1; i < c->n_coeffs; i++)
806 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
807 if (v->type == ALPHA)
816 subcommand_export (int export, pspp_linreg_cache * c)
821 int n_quantiles = 100;
824 struct pspp_linreg_coeff coeff;
829 assert (model_file != NULL);
830 fp = fopen (fh_get_file_name (model_file), "w");
832 fprintf (fp, "%s", reg_preamble);
833 reg_print_getvar (fp, c);
834 if (reg_has_categorical (c))
836 reg_print_categorical_encoding (fp, c);
838 fprintf (fp, "%s", reg_export_t_quantiles_1);
839 increment = 0.5 / (double) increment;
840 for (i = 0; i < n_quantiles - 1; i++)
842 tmp = 0.5 + 0.005 * (double) i;
843 fprintf (fp, "%.15e,\n\t\t",
844 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
846 fprintf (fp, "%.15e};\n\t",
847 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
848 fprintf (fp, "%s", reg_export_t_quantiles_2);
849 fprintf (fp, "%s", reg_mean_cmt);
850 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
851 fprintf (fp, "const char *var_names[])\n{\n\t");
852 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
853 for (i = 1; i < c->n_indeps; i++)
856 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
859 fprintf (fp, "%.15e};\n\t", coeff.estimate);
861 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
862 fprintf (fp, "int i;\n\tint j;\n\n\t");
863 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
864 fprintf (fp, "%s", reg_getvar);
865 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
867 for (i = 0; i < c->cov->size1 - 1; i++)
870 for (j = 0; j < c->cov->size2 - 1; j++)
872 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
874 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
877 for (j = 0; j < c->cov->size2 - 1; j++)
879 fprintf (fp, "%.15e, ",
880 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
882 fprintf (fp, "%.15e}\n\t",
883 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
884 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
886 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
887 fprintf (fp, "%s", reg_variance);
888 fprintf (fp, "%s", reg_export_confidence_interval);
889 tmp = c->mse * c->mse;
890 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
891 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
892 fprintf (fp, "%s", reg_export_prediction_interval_3);
894 fp = fopen ("pspp_model_reg.h", "w");
895 fprintf (fp, "%s", reg_header);
900 regression_custom_export (struct cmd_regression *cmd UNUSED)
902 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
903 if (!lex_force_match ('('))
910 model_file = fh_parse (FH_REF_FILE);
911 if (model_file == NULL)
915 if (!lex_force_match (')'))
922 cmd_regression (void)
924 if (!parse_regression (&cmd))
927 models = xnmalloc (cmd.n_dependent, sizeof *models);
928 if (!multipass_procedure_with_splits (run_regression, &cmd))
929 return CMD_CASCADING_FAILURE;
930 subcommand_save (cmd.sbc_save, models);
937 Is variable k the dependent variable?
940 is_depvar (size_t k, const struct variable *v)
943 compare_var_names returns 0 if the variable
946 if (!compare_var_names (v, v_variables[k], NULL))
953 Mark missing cases. Return the number of non-missing cases.
956 mark_missing_cases (const struct casefile *cf, struct variable *v,
957 int *is_missing_case, double n_data)
959 struct casereader *r;
962 const union value *val;
964 for (r = casefile_get_reader (cf);
965 casereader_read (r, &c); case_destroy (&c))
967 row = casereader_cnum (r) - 1;
969 val = case_data (&c, v->fv);
970 cat_value_update (v, val);
971 if (mv_is_value_missing (&v->miss, val))
973 if (!is_missing_case[row])
975 /* Now it is missing. */
977 is_missing_case[row] = 1;
981 casereader_destroy (r);
986 /* Parser for the variables sub command */
988 regression_custom_variables(struct cmd_regression *cmd UNUSED)
993 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
998 if (!parse_variables (default_dict, &v_variables, &n_variables,
1004 assert(n_variables);
1009 Count the explanatory variables. The user may or may
1010 not have specified a response variable in the syntax.
1013 int get_n_indep (const struct variable *v)
1018 result = n_variables;
1019 while (i < n_variables)
1021 if (is_depvar (i, v))
1031 Read from the active file. Identify the explanatory variables in
1032 v_variables. Encode categorical variables. Drop cases with missing
1036 int prepare_data (int n_data, int is_missing_case[],
1037 struct variable **indep_vars,
1038 struct variable *depvar,
1039 const struct casefile *cf)
1044 assert (indep_vars != NULL);
1046 for (i = 0; i < n_variables; i++)
1048 if (!is_depvar (i, depvar))
1050 indep_vars[j] = v_variables[i];
1052 if (v_variables[i]->type == ALPHA)
1054 /* Make a place to hold the binary vectors
1055 corresponding to this variable's values. */
1056 cat_stored_values_create (v_variables[i]);
1058 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1062 Mark missing cases for the dependent variable.
1064 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1069 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
1072 size_t n_data = 0; /* Number of valide cases. */
1073 size_t n_cases; /* Number of cases. */
1079 Keep track of the missing cases.
1081 int *is_missing_case;
1082 const union value *val;
1083 struct casereader *r;
1085 struct variable **indep_vars;
1086 struct design_matrix *X;
1089 pspp_linreg_opts lopts;
1091 assert (models != NULL);
1094 dict_get_vars (default_dict, &v_variables, &n_variables,
1098 n_cases = casefile_get_case_cnt (cf);
1100 for (i = 0; i < cmd.n_dependent; i++)
1102 if (cmd.v_dependent[i]->type != NUMERIC)
1104 msg (SE, gettext ("Dependent variable must be numeric."));
1105 pspp_reg_rc = CMD_FAILURE;
1110 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1112 lopts.get_depvar_mean_std = 1;
1114 for (k = 0; k < cmd.n_dependent; k++)
1116 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1117 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1118 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1119 assert (indep_vars != NULL);
1121 for (i = 0; i < n_cases; i++)
1123 is_missing_case[i] = 0;
1125 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1127 (const struct casefile *) cf);
1128 Y = gsl_vector_alloc (n_data);
1131 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1133 for (i = 0; i < X->m->size2; i++)
1135 lopts.get_indep_mean_std[i] = 1;
1137 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1138 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1139 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1140 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1142 For large data sets, use QR decomposition.
1144 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1146 models[k]->method = PSPP_LINREG_SVD;
1150 The second pass fills the design matrix.
1153 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1155 /* Iterate over the cases. */
1157 case_num = casereader_cnum (r) - 1;
1158 if (!is_missing_case[case_num])
1160 for (i = 0; i < n_variables; ++i) /* Iterate over the
1165 val = case_data (&c, v_variables[i]->fv);
1167 Independent/dependent variable separation. The
1168 'variables' subcommand specifies a varlist which contains
1169 both dependent and independent variables. The dependent
1170 variables are specified with the 'dependent'
1171 subcommand, and maybe also in the 'variables' subcommand.
1172 We need to separate the two.
1174 if (!is_depvar (i, cmd.v_dependent[k]))
1176 if (v_variables[i]->type == ALPHA)
1178 design_matrix_set_categorical (X, row, v_variables[i], val);
1180 else if (v_variables[i]->type == NUMERIC)
1182 design_matrix_set_numeric (X, row, v_variables[i], val);
1186 val = case_data (&c, cmd.v_dependent[k]->fv);
1187 gsl_vector_set (Y, row, val->f);
1192 Now that we know the number of coefficients, allocate space
1193 and store pointers to the variables that correspond to the
1196 pspp_linreg_coeff_init (models[k], X);
1199 Find the least-squares estimates and other statistics.
1201 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1202 subcommand_statistics (cmd.a_statistics, models[k]);
1203 subcommand_export (cmd.sbc_export, models[k]);
1205 gsl_vector_free (Y);
1206 design_matrix_destroy (X);
1208 free (lopts.get_indep_mean_std);
1209 casereader_destroy (r);
1212 free (is_missing_case);