1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/dictionary/split-file.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <libpspp/alloc.h>
44 #include <libpspp/compiler.h>
45 #include <libpspp/message.h>
46 #include <math/design-matrix.h>
47 #include <math/linreg/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
84 static struct cmd_regression cmd;
86 /* Linear regression models. */
87 pspp_linreg_cache **models = NULL;
90 Transformations for saving predicted values
95 int n_trns; /* Number of transformations. */
96 int trns_id; /* Which trns is this one? */
97 pspp_linreg_cache *c; /* Linear model for this trns. */
100 Variables used (both explanatory and response).
102 static struct variable **v_variables;
107 static size_t n_variables;
110 File where the model will be saved if the EXPORT subcommand
113 struct file_handle *model_file;
116 Return value for the procedure.
118 int pspp_reg_rc = CMD_SUCCESS;
120 static bool run_regression (const struct ccase *,
121 const struct casefile *, void *);
124 STATISTICS subcommand output functions.
126 static void reg_stats_r (pspp_linreg_cache *);
127 static void reg_stats_coeff (pspp_linreg_cache *);
128 static void reg_stats_anova (pspp_linreg_cache *);
129 static void reg_stats_outs (pspp_linreg_cache *);
130 static void reg_stats_zpp (pspp_linreg_cache *);
131 static void reg_stats_label (pspp_linreg_cache *);
132 static void reg_stats_sha (pspp_linreg_cache *);
133 static void reg_stats_ci (pspp_linreg_cache *);
134 static void reg_stats_f (pspp_linreg_cache *);
135 static void reg_stats_bcov (pspp_linreg_cache *);
136 static void reg_stats_ses (pspp_linreg_cache *);
137 static void reg_stats_xtx (pspp_linreg_cache *);
138 static void reg_stats_collin (pspp_linreg_cache *);
139 static void reg_stats_tol (pspp_linreg_cache *);
140 static void reg_stats_selection (pspp_linreg_cache *);
141 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
142 int, pspp_linreg_cache *);
145 reg_stats_r (pspp_linreg_cache * c)
155 rsq = c->ssm / c->sst;
156 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
157 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
158 t = tab_create (n_cols, n_rows, 0);
159 tab_dim (t, tab_natural_dimensions);
160 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
161 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
162 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
163 tab_vline (t, TAL_0, 1, 0, 0);
165 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
166 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
167 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
168 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
169 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
170 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
171 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
172 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
173 tab_title (t, _("Model Summary"));
178 Table showing estimated regression coefficients.
181 reg_stats_coeff (pspp_linreg_cache * c)
193 const struct variable *v;
194 const union value *val;
199 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
200 n_rows = c->n_coeffs + 2;
202 t = tab_create (n_cols, n_rows, 0);
203 tab_headers (t, 2, 0, 1, 0);
204 tab_dim (t, tab_natural_dimensions);
205 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
206 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
207 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
208 tab_vline (t, TAL_0, 1, 0, 0);
210 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
211 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
212 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
213 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
214 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
215 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
216 coeff = c->coeff[0].estimate;
217 tab_float (t, 2, 1, 0, coeff, 10, 2);
218 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
219 tab_float (t, 3, 1, 0, std_err, 10, 2);
220 beta = coeff / c->depvar_std;
221 tab_float (t, 4, 1, 0, beta, 10, 2);
222 t_stat = coeff / std_err;
223 tab_float (t, 5, 1, 0, t_stat, 10, 2);
224 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
225 tab_float (t, 6, 1, 0, pval, 10, 2);
226 for (j = 1; j <= c->n_indeps; j++)
228 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
229 label = var_to_string (v);
230 /* Do not overwrite the variable's name. */
231 strncpy (tmp, label, MAX_STRING);
232 if (v->type == ALPHA)
235 Append the value associated with this coefficient.
236 This makes sense only if we us the usual binary encoding
240 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
241 val_s = value_to_string (val, v);
242 strncat (tmp, val_s, MAX_STRING);
245 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
247 Regression coefficients.
249 coeff = c->coeff[j].estimate;
250 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
252 Standard error of the coefficients.
254 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
255 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
257 'Standardized' coefficient, i.e., regression coefficient
258 if all variables had unit variance.
260 beta = gsl_vector_get (c->indep_std, j);
261 beta *= coeff / c->depvar_std;
262 tab_float (t, 4, j + 1, 0, beta, 10, 2);
265 Test statistic for H0: coefficient is 0.
267 t_stat = coeff / std_err;
268 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
270 P values for the test statistic above.
272 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
273 tab_float (t, 6, j + 1, 0, pval, 10, 2);
275 tab_title (t, _("Coefficients"));
281 Display the ANOVA table.
284 reg_stats_anova (pspp_linreg_cache * c)
288 const double msm = c->ssm / c->dfm;
289 const double mse = c->sse / c->dfe;
290 const double F = msm / mse;
291 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
296 t = tab_create (n_cols, n_rows, 0);
297 tab_headers (t, 2, 0, 1, 0);
298 tab_dim (t, tab_natural_dimensions);
300 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
302 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
303 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
304 tab_vline (t, TAL_0, 1, 0, 0);
306 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
307 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
308 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
309 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
310 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
312 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
313 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
314 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
316 /* Sums of Squares */
317 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
318 tab_float (t, 2, 3, 0, c->sst, 10, 2);
319 tab_float (t, 2, 2, 0, c->sse, 10, 2);
322 /* Degrees of freedom */
323 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
324 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
325 tab_float (t, 3, 3, 0, c->dft, 4, 0);
329 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
330 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
332 tab_float (t, 5, 1, 0, F, 8, 3);
334 tab_float (t, 6, 1, 0, pval, 8, 3);
336 tab_title (t, _("ANOVA"));
340 reg_stats_outs (pspp_linreg_cache * c)
345 reg_stats_zpp (pspp_linreg_cache * c)
350 reg_stats_label (pspp_linreg_cache * c)
355 reg_stats_sha (pspp_linreg_cache * c)
360 reg_stats_ci (pspp_linreg_cache * c)
365 reg_stats_f (pspp_linreg_cache * c)
370 reg_stats_bcov (pspp_linreg_cache * c)
382 n_cols = c->n_indeps + 1 + 2;
383 n_rows = 2 * (c->n_indeps + 1);
384 t = tab_create (n_cols, n_rows, 0);
385 tab_headers (t, 2, 0, 1, 0);
386 tab_dim (t, tab_natural_dimensions);
387 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
388 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
389 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
390 tab_vline (t, TAL_0, 1, 0, 0);
391 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
392 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
393 for (i = 1; i < c->n_coeffs; i++)
395 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
396 label = var_to_string (v);
397 tab_text (t, 2, i, TAB_CENTER, label);
398 tab_text (t, i + 2, 0, TAB_CENTER, label);
399 for (k = 1; k < c->n_coeffs; k++)
401 col = (i <= k) ? k : i;
402 row = (i <= k) ? i : k;
403 tab_float (t, k + 2, i, TAB_CENTER,
404 gsl_matrix_get (c->cov, row, col), 8, 3);
407 tab_title (t, _("Coefficient Correlations"));
411 reg_stats_ses (pspp_linreg_cache * c)
416 reg_stats_xtx (pspp_linreg_cache * c)
421 reg_stats_collin (pspp_linreg_cache * c)
426 reg_stats_tol (pspp_linreg_cache * c)
431 reg_stats_selection (pspp_linreg_cache * c)
437 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
438 int keyword, pspp_linreg_cache * c)
447 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
450 The order here must match the order in which the STATISTICS
451 keywords appear in the specification section above.
478 Set everything but F.
480 for (i = 0; i < f; i++)
487 for (i = 0; i < all; i++)
495 Default output: ANOVA table, parameter estimates,
496 and statistics for variables not entered into model,
499 if (keywords[defaults] | d)
507 statistics_keyword_output (reg_stats_r, keywords[r], c);
508 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
509 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
510 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
511 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
512 statistics_keyword_output (reg_stats_label, keywords[label], c);
513 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
514 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
515 statistics_keyword_output (reg_stats_f, keywords[f], c);
516 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
517 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
518 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
519 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
520 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
521 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
524 Free the transformation. Free its linear model if this
525 transformation is the last one.
528 bool regression_trns_free (void *t_)
531 struct reg_trns *t = t_;
533 if (t->trns_id == t->n_trns)
535 result = pspp_linreg_cache_free (t->c);
542 Gets the predicted values.
545 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
549 struct reg_trns *trns = t_;
550 pspp_linreg_cache *model;
551 union value *output = NULL;
552 const union value **vals = NULL;
553 struct variable **vars = NULL;
555 assert (trns!= NULL);
557 assert (model != NULL);
558 assert (model->depvar != NULL);
559 assert (model->pred != NULL);
561 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
562 n_vals = (*model->get_vars) (model, vars);
564 vals = xnmalloc (n_vals, sizeof (*vals));
565 output = case_data_rw (c, model->pred->fv);
566 assert (output != NULL);
568 for (i = 0; i < n_vals; i++)
570 vals[i] = case_data (c, vars[i]->fv);
572 output->f = (*model->predict) ((const struct variable **) vars,
573 vals, model, n_vals);
576 return TRNS_CONTINUE;
582 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
586 struct reg_trns *trns = t_;
587 pspp_linreg_cache *model;
588 union value *output = NULL;
589 const union value **vals = NULL;
590 const union value *obs = NULL;
591 struct variable **vars = NULL;
593 assert (trns!= NULL);
595 assert (model != NULL);
596 assert (model->depvar != NULL);
597 assert (model->resid != NULL);
599 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
600 n_vals = (*model->get_vars) (model, vars);
602 vals = xnmalloc (n_vals, sizeof (*vals));
603 output = case_data_rw (c, model->resid->fv);
604 assert (output != NULL);
606 for (i = 0; i < n_vals; i++)
608 vals[i] = case_data (c, vars[i]->fv);
610 obs = case_data (c, model->depvar->fv);
611 output->f = (*model->residual) ((const struct variable **) vars,
612 vals, obs, model, n_vals);
615 return TRNS_CONTINUE;
618 Returns 0 if NAME is a duplicate of any existing variable name.
621 try_name (char *name)
623 if (dict_lookup_var (default_dict, name) != NULL)
629 void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
633 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
634 while (!try_name(name))
637 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
641 reg_save_var (const char *prefix, trns_proc_func *f,
642 pspp_linreg_cache *c, struct variable **v,
645 static int trns_index = 1;
646 char name[LONG_NAME_LEN];
647 struct variable *new_var;
648 struct reg_trns *t = NULL;
650 t = xmalloc (sizeof (*t));
651 t->trns_id = trns_index;
654 reg_get_name (name, prefix);
655 new_var = dict_create_var (default_dict, name, 0);
656 assert (new_var != NULL);
658 add_transformation (f, regression_trns_free, t);
662 subcommand_save (int save, pspp_linreg_cache **models)
664 pspp_linreg_cache **lc;
668 assert (models != NULL);
672 /* Count the number of transformations we will need. */
673 for (i = 0; i < REGRESSION_SV_count; i++)
680 n_trns *= cmd.n_dependent;
682 for (lc = models; lc < models + cmd.n_dependent; lc++)
684 assert (*lc != NULL);
685 assert ((*lc)->depvar != NULL);
686 if (cmd.a_save[REGRESSION_SV_RESID])
688 reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns);
690 if (cmd.a_save[REGRESSION_SV_PRED])
692 reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns);
698 for (lc = models; lc < models + cmd.n_dependent; lc++)
700 assert (*lc != NULL);
701 pspp_linreg_cache_free (*lc);
706 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
710 for (i = 0; i < n_vars; i++)
712 if (v->index == varlist[i]->index)
720 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
725 struct variable **varlist;
726 struct pspp_linreg_coeff *coeff;
727 const struct variable *v;
730 fprintf (fp, "%s", reg_export_categorical_encode_1);
732 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
733 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
735 coeff = c->coeff + i;
736 v = pspp_linreg_coeff_get_var (coeff, 0);
737 if (v->type == ALPHA)
739 if (!reg_inserted (v, varlist, n_vars))
741 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
743 varlist[n_vars] = (struct variable *) v;
748 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
749 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
751 for (i = 0; i < n_vars - 1; i++)
753 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
755 fprintf (fp, "&%s};\n\t", varlist[i]->name);
757 for (i = 0; i < n_vars; i++)
759 coeff = c->coeff + i;
760 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
762 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
763 varlist[i]->obs_vals->n_categories);
765 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
767 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
768 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
769 value_to_string (val, varlist[i]));
772 fprintf (fp, "%s", reg_export_categorical_encode_2);
776 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
779 struct pspp_linreg_coeff *coeff;
780 const struct variable *v;
782 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
783 for (i = 1; i < c->n_indeps; i++)
785 coeff = c->coeff + i;
786 v = pspp_linreg_coeff_get_var (coeff, 0);
787 fprintf (fp, "\"%s\",\n\t\t", v->name);
789 coeff = c->coeff + i;
790 v = pspp_linreg_coeff_get_var (coeff, 0);
791 fprintf (fp, "\"%s\"};\n\t", v->name);
794 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
796 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
797 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
798 reg_print_depvars (fp, c);
799 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
801 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
802 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
805 reg_has_categorical (pspp_linreg_cache * c)
808 const struct variable *v;
810 for (i = 1; i < c->n_coeffs; i++)
812 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
813 if (v->type == ALPHA)
822 subcommand_export (int export, pspp_linreg_cache * c)
827 int n_quantiles = 100;
830 struct pspp_linreg_coeff coeff;
835 assert (model_file != NULL);
836 fp = fopen (fh_get_file_name (model_file), "w");
838 fprintf (fp, "%s", reg_preamble);
839 reg_print_getvar (fp, c);
840 if (reg_has_categorical (c))
842 reg_print_categorical_encoding (fp, c);
844 fprintf (fp, "%s", reg_export_t_quantiles_1);
845 increment = 0.5 / (double) increment;
846 for (i = 0; i < n_quantiles - 1; i++)
848 tmp = 0.5 + 0.005 * (double) i;
849 fprintf (fp, "%.15e,\n\t\t",
850 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
852 fprintf (fp, "%.15e};\n\t",
853 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
854 fprintf (fp, "%s", reg_export_t_quantiles_2);
855 fprintf (fp, "%s", reg_mean_cmt);
856 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
857 fprintf (fp, "const char *var_names[])\n{\n\t");
858 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
859 for (i = 1; i < c->n_indeps; i++)
862 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
865 fprintf (fp, "%.15e};\n\t", coeff.estimate);
867 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
868 fprintf (fp, "int i;\n\tint j;\n\n\t");
869 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
870 fprintf (fp, "%s", reg_getvar);
871 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
873 for (i = 0; i < c->cov->size1 - 1; i++)
876 for (j = 0; j < c->cov->size2 - 1; j++)
878 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
880 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
883 for (j = 0; j < c->cov->size2 - 1; j++)
885 fprintf (fp, "%.15e, ",
886 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
888 fprintf (fp, "%.15e}\n\t",
889 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
890 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
892 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
893 fprintf (fp, "%s", reg_variance);
894 fprintf (fp, "%s", reg_export_confidence_interval);
895 tmp = c->mse * c->mse;
896 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
897 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
898 fprintf (fp, "%s", reg_export_prediction_interval_3);
900 fp = fopen ("pspp_model_reg.h", "w");
901 fprintf (fp, "%s", reg_header);
906 regression_custom_export (struct cmd_regression *cmd UNUSED)
908 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
909 if (!lex_force_match ('('))
916 model_file = fh_parse (FH_REF_FILE);
917 if (model_file == NULL)
921 if (!lex_force_match (')'))
928 cmd_regression (void)
930 if (!parse_regression (&cmd))
933 models = xnmalloc (cmd.n_dependent, sizeof *models);
934 if (!multipass_procedure_with_splits (run_regression, &cmd))
935 return CMD_CASCADING_FAILURE;
936 subcommand_save (cmd.sbc_save, models);
943 Is variable k the dependent variable?
946 is_depvar (size_t k, const struct variable *v)
949 compare_var_names returns 0 if the variable
952 if (!compare_var_names (v, v_variables[k], NULL))
959 Mark missing cases. Return the number of non-missing cases.
962 mark_missing_cases (const struct casefile *cf, struct variable *v,
963 int *is_missing_case, double n_data)
965 struct casereader *r;
968 const union value *val;
970 for (r = casefile_get_reader (cf);
971 casereader_read (r, &c); case_destroy (&c))
973 row = casereader_cnum (r) - 1;
975 val = case_data (&c, v->fv);
976 cat_value_update (v, val);
977 if (mv_is_value_missing (&v->miss, val))
979 if (!is_missing_case[row])
981 /* Now it is missing. */
983 is_missing_case[row] = 1;
987 casereader_destroy (r);
992 /* Parser for the variables sub command */
994 regression_custom_variables(struct cmd_regression *cmd UNUSED)
999 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
1004 if (!parse_variables (default_dict, &v_variables, &n_variables,
1010 assert(n_variables);
1015 Count the explanatory variables. The user may or may
1016 not have specified a response variable in the syntax.
1019 int get_n_indep (const struct variable *v)
1024 result = n_variables;
1025 while (i < n_variables)
1027 if (is_depvar (i, v))
1037 Read from the active file. Identify the explanatory variables in
1038 v_variables. Encode categorical variables. Drop cases with missing
1042 int prepare_data (int n_data, int is_missing_case[],
1043 struct variable **indep_vars,
1044 struct variable *depvar,
1045 const struct casefile *cf)
1050 assert (indep_vars != NULL);
1052 for (i = 0; i < n_variables; i++)
1054 if (!is_depvar (i, depvar))
1056 indep_vars[j] = v_variables[i];
1058 if (v_variables[i]->type == ALPHA)
1060 /* Make a place to hold the binary vectors
1061 corresponding to this variable's values. */
1062 cat_stored_values_create (v_variables[i]);
1064 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1068 Mark missing cases for the dependent variable.
1070 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1075 run_regression (const struct ccase *first,
1076 const struct casefile *cf, void *cmd_ UNUSED)
1079 size_t n_data = 0; /* Number of valide cases. */
1080 size_t n_cases; /* Number of cases. */
1086 Keep track of the missing cases.
1088 int *is_missing_case;
1089 const union value *val;
1090 struct casereader *r;
1092 struct variable **indep_vars;
1093 struct design_matrix *X;
1096 pspp_linreg_opts lopts;
1098 assert (models != NULL);
1100 output_split_file_values (first);
1104 dict_get_vars (default_dict, &v_variables, &n_variables,
1108 n_cases = casefile_get_case_cnt (cf);
1110 for (i = 0; i < cmd.n_dependent; i++)
1112 if (cmd.v_dependent[i]->type != NUMERIC)
1114 msg (SE, gettext ("Dependent variable must be numeric."));
1115 pspp_reg_rc = CMD_FAILURE;
1120 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1122 lopts.get_depvar_mean_std = 1;
1124 for (k = 0; k < cmd.n_dependent; k++)
1126 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1127 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1128 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1129 assert (indep_vars != NULL);
1131 for (i = 0; i < n_cases; i++)
1133 is_missing_case[i] = 0;
1135 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1137 (const struct casefile *) cf);
1138 Y = gsl_vector_alloc (n_data);
1141 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1143 for (i = 0; i < X->m->size2; i++)
1145 lopts.get_indep_mean_std[i] = 1;
1147 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1148 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1149 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1150 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1152 For large data sets, use QR decomposition.
1154 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1156 models[k]->method = PSPP_LINREG_SVD;
1160 The second pass fills the design matrix.
1163 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1165 /* Iterate over the cases. */
1167 case_num = casereader_cnum (r) - 1;
1168 if (!is_missing_case[case_num])
1170 for (i = 0; i < n_variables; ++i) /* Iterate over the
1175 val = case_data (&c, v_variables[i]->fv);
1177 Independent/dependent variable separation. The
1178 'variables' subcommand specifies a varlist which contains
1179 both dependent and independent variables. The dependent
1180 variables are specified with the 'dependent'
1181 subcommand, and maybe also in the 'variables' subcommand.
1182 We need to separate the two.
1184 if (!is_depvar (i, cmd.v_dependent[k]))
1186 if (v_variables[i]->type == ALPHA)
1188 design_matrix_set_categorical (X, row, v_variables[i], val);
1190 else if (v_variables[i]->type == NUMERIC)
1192 design_matrix_set_numeric (X, row, v_variables[i], val);
1196 val = case_data (&c, cmd.v_dependent[k]->fv);
1197 gsl_vector_set (Y, row, val->f);
1202 Now that we know the number of coefficients, allocate space
1203 and store pointers to the variables that correspond to the
1206 pspp_linreg_coeff_init (models[k], X);
1209 Find the least-squares estimates and other statistics.
1211 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1212 subcommand_statistics (cmd.a_statistics, models[k]);
1213 subcommand_export (cmd.sbc_export, models[k]);
1215 gsl_vector_free (Y);
1216 design_matrix_destroy (X);
1218 free (lopts.get_indep_mean_std);
1219 casereader_destroy (r);
1222 free (is_missing_case);