1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/dictionary/split-file.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <libpspp/alloc.h>
44 #include <libpspp/compiler.h>
45 #include <libpspp/message.h>
46 #include <math/design-matrix.h>
47 #include <math/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
86 /* Linear regression models. */
87 static pspp_linreg_cache **models = NULL;
90 Transformations for saving predicted values
95 int n_trns; /* Number of transformations. */
96 int trns_id; /* Which trns is this one? */
97 pspp_linreg_cache *c; /* Linear model for this trns. */
100 Variables used (both explanatory and response).
102 static struct variable **v_variables;
107 static size_t n_variables;
110 File where the model will be saved if the EXPORT subcommand
113 static struct file_handle *model_file;
116 Return value for the procedure.
118 static int pspp_reg_rc = CMD_SUCCESS;
120 static bool run_regression (const struct ccase *,
121 const struct casefile *, void *);
124 STATISTICS subcommand output functions.
126 static void reg_stats_r (pspp_linreg_cache *);
127 static void reg_stats_coeff (pspp_linreg_cache *);
128 static void reg_stats_anova (pspp_linreg_cache *);
129 static void reg_stats_outs (pspp_linreg_cache *);
130 static void reg_stats_zpp (pspp_linreg_cache *);
131 static void reg_stats_label (pspp_linreg_cache *);
132 static void reg_stats_sha (pspp_linreg_cache *);
133 static void reg_stats_ci (pspp_linreg_cache *);
134 static void reg_stats_f (pspp_linreg_cache *);
135 static void reg_stats_bcov (pspp_linreg_cache *);
136 static void reg_stats_ses (pspp_linreg_cache *);
137 static void reg_stats_xtx (pspp_linreg_cache *);
138 static void reg_stats_collin (pspp_linreg_cache *);
139 static void reg_stats_tol (pspp_linreg_cache *);
140 static void reg_stats_selection (pspp_linreg_cache *);
141 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
142 int, pspp_linreg_cache *);
145 reg_stats_r (pspp_linreg_cache * c)
155 rsq = c->ssm / c->sst;
156 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
157 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
158 t = tab_create (n_cols, n_rows, 0);
159 tab_dim (t, tab_natural_dimensions);
160 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
161 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
162 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
163 tab_vline (t, TAL_0, 1, 0, 0);
165 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
166 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
167 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
168 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
169 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
170 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
171 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
172 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
173 tab_title (t, _("Model Summary"));
178 Table showing estimated regression coefficients.
181 reg_stats_coeff (pspp_linreg_cache * c)
193 const struct variable *v;
194 const union value *val;
199 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
200 n_rows = c->n_coeffs + 2;
202 t = tab_create (n_cols, n_rows, 0);
203 tab_headers (t, 2, 0, 1, 0);
204 tab_dim (t, tab_natural_dimensions);
205 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
206 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
207 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
208 tab_vline (t, TAL_0, 1, 0, 0);
210 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
211 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
212 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
213 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
214 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
215 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
216 coeff = c->coeff[0]->estimate;
217 tab_float (t, 2, 1, 0, coeff, 10, 2);
218 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
219 tab_float (t, 3, 1, 0, std_err, 10, 2);
220 beta = coeff / c->depvar_std;
221 tab_float (t, 4, 1, 0, beta, 10, 2);
222 t_stat = coeff / std_err;
223 tab_float (t, 5, 1, 0, t_stat, 10, 2);
224 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
225 tab_float (t, 6, 1, 0, pval, 10, 2);
226 for (j = 1; j <= c->n_indeps; j++)
228 v = pspp_coeff_get_var (c->coeff[j], 0);
229 label = var_to_string (v);
230 /* Do not overwrite the variable's name. */
231 strncpy (tmp, label, MAX_STRING);
232 if (v->type == ALPHA)
235 Append the value associated with this coefficient.
236 This makes sense only if we us the usual binary encoding
240 val = pspp_coeff_get_value (c->coeff[j], v);
241 val_s = value_to_string (val, v);
242 strncat (tmp, val_s, MAX_STRING);
245 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
247 Regression coefficients.
249 coeff = c->coeff[j]->estimate;
250 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
252 Standard error of the coefficients.
254 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
255 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
257 'Standardized' coefficient, i.e., regression coefficient
258 if all variables had unit variance.
260 beta = gsl_vector_get (c->indep_std, j);
261 beta *= coeff / c->depvar_std;
262 tab_float (t, 4, j + 1, 0, beta, 10, 2);
265 Test statistic for H0: coefficient is 0.
267 t_stat = coeff / std_err;
268 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
270 P values for the test statistic above.
272 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
273 tab_float (t, 6, j + 1, 0, pval, 10, 2);
275 tab_title (t, _("Coefficients"));
281 Display the ANOVA table.
284 reg_stats_anova (pspp_linreg_cache * c)
288 const double msm = c->ssm / c->dfm;
289 const double mse = c->sse / c->dfe;
290 const double F = msm / mse;
291 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
296 t = tab_create (n_cols, n_rows, 0);
297 tab_headers (t, 2, 0, 1, 0);
298 tab_dim (t, tab_natural_dimensions);
300 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
302 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
303 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
304 tab_vline (t, TAL_0, 1, 0, 0);
306 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
307 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
308 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
309 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
310 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
312 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
313 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
314 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
316 /* Sums of Squares */
317 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
318 tab_float (t, 2, 3, 0, c->sst, 10, 2);
319 tab_float (t, 2, 2, 0, c->sse, 10, 2);
322 /* Degrees of freedom */
323 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
324 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
325 tab_float (t, 3, 3, 0, c->dft, 4, 0);
329 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
330 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
332 tab_float (t, 5, 1, 0, F, 8, 3);
334 tab_float (t, 6, 1, 0, pval, 8, 3);
336 tab_title (t, _("ANOVA"));
340 reg_stats_outs (pspp_linreg_cache * c)
345 reg_stats_zpp (pspp_linreg_cache * c)
350 reg_stats_label (pspp_linreg_cache * c)
355 reg_stats_sha (pspp_linreg_cache * c)
360 reg_stats_ci (pspp_linreg_cache * c)
365 reg_stats_f (pspp_linreg_cache * c)
370 reg_stats_bcov (pspp_linreg_cache * c)
382 n_cols = c->n_indeps + 1 + 2;
383 n_rows = 2 * (c->n_indeps + 1);
384 t = tab_create (n_cols, n_rows, 0);
385 tab_headers (t, 2, 0, 1, 0);
386 tab_dim (t, tab_natural_dimensions);
387 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
388 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
389 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
390 tab_vline (t, TAL_0, 1, 0, 0);
391 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
392 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
393 for (i = 1; i < c->n_coeffs; i++)
395 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
396 label = var_to_string (v);
397 tab_text (t, 2, i, TAB_CENTER, label);
398 tab_text (t, i + 2, 0, TAB_CENTER, label);
399 for (k = 1; k < c->n_coeffs; k++)
401 col = (i <= k) ? k : i;
402 row = (i <= k) ? i : k;
403 tab_float (t, k + 2, i, TAB_CENTER,
404 gsl_matrix_get (c->cov, row, col), 8, 3);
407 tab_title (t, _("Coefficient Correlations"));
411 reg_stats_ses (pspp_linreg_cache * c)
416 reg_stats_xtx (pspp_linreg_cache * c)
421 reg_stats_collin (pspp_linreg_cache * c)
426 reg_stats_tol (pspp_linreg_cache * c)
431 reg_stats_selection (pspp_linreg_cache * c)
437 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
438 int keyword, pspp_linreg_cache * c)
447 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
450 The order here must match the order in which the STATISTICS
451 keywords appear in the specification section above.
478 Set everything but F.
480 for (i = 0; i < f; i++)
487 for (i = 0; i < all; i++)
495 Default output: ANOVA table, parameter estimates,
496 and statistics for variables not entered into model,
499 if (keywords[defaults] | d)
507 statistics_keyword_output (reg_stats_r, keywords[r], c);
508 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
509 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
510 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
511 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
512 statistics_keyword_output (reg_stats_label, keywords[label], c);
513 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
514 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
515 statistics_keyword_output (reg_stats_f, keywords[f], c);
516 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
517 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
518 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
519 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
520 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
521 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
525 Free the transformation. Free its linear model if this
526 transformation is the last one.
529 regression_trns_free (void *t_)
532 struct reg_trns *t = t_;
534 if (t->trns_id == t->n_trns)
536 result = pspp_linreg_cache_free (t->c);
544 Gets the predicted values.
547 regression_trns_pred_proc (void *t_, struct ccase *c,
548 casenum_t case_idx UNUSED)
552 struct reg_trns *trns = t_;
553 pspp_linreg_cache *model;
554 union value *output = NULL;
555 const union value **vals = NULL;
556 struct variable **vars = NULL;
558 assert (trns != NULL);
560 assert (model != NULL);
561 assert (model->depvar != NULL);
562 assert (model->pred != NULL);
564 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
565 n_vals = (*model->get_vars) (model, vars);
567 vals = xnmalloc (n_vals, sizeof (*vals));
568 output = case_data_rw (c, model->pred->fv);
569 assert (output != NULL);
571 for (i = 0; i < n_vals; i++)
573 vals[i] = case_data (c, vars[i]->fv);
575 output->f = (*model->predict) ((const struct variable **) vars,
576 vals, model, n_vals);
579 return TRNS_CONTINUE;
586 regression_trns_resid_proc (void *t_, struct ccase *c,
587 casenum_t case_idx UNUSED)
591 struct reg_trns *trns = t_;
592 pspp_linreg_cache *model;
593 union value *output = NULL;
594 const union value **vals = NULL;
595 const union value *obs = NULL;
596 struct variable **vars = NULL;
598 assert (trns != NULL);
600 assert (model != NULL);
601 assert (model->depvar != NULL);
602 assert (model->resid != NULL);
604 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
605 n_vals = (*model->get_vars) (model, vars);
607 vals = xnmalloc (n_vals, sizeof (*vals));
608 output = case_data_rw (c, model->resid->fv);
609 assert (output != NULL);
611 for (i = 0; i < n_vals; i++)
613 vals[i] = case_data (c, vars[i]->fv);
615 obs = case_data (c, model->depvar->fv);
616 output->f = (*model->residual) ((const struct variable **) vars,
617 vals, obs, model, n_vals);
620 return TRNS_CONTINUE;
624 Returns 0 if NAME is a duplicate of any existing variable name.
627 try_name (char *name)
629 if (dict_lookup_var (dataset_dict (current_dataset), name) != NULL)
635 reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
639 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
640 while (!try_name (name))
643 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
647 reg_save_var (const char *prefix, trns_proc_func * f,
648 pspp_linreg_cache * c, struct variable **v, int n_trns)
650 static int trns_index = 1;
651 char name[LONG_NAME_LEN];
652 struct variable *new_var;
653 struct reg_trns *t = NULL;
655 t = xmalloc (sizeof (*t));
656 t->trns_id = trns_index;
659 reg_get_name (name, prefix);
660 new_var = dict_create_var (dataset_dict (current_dataset), name, 0);
661 assert (new_var != NULL);
663 add_transformation (current_dataset, f, regression_trns_free, t);
667 subcommand_save (int save, pspp_linreg_cache ** models)
669 pspp_linreg_cache **lc;
673 assert (models != NULL);
677 /* Count the number of transformations we will need. */
678 for (i = 0; i < REGRESSION_SV_count; i++)
685 n_trns *= cmd.n_dependent;
687 for (lc = models; lc < models + cmd.n_dependent; lc++)
689 assert (*lc != NULL);
690 assert ((*lc)->depvar != NULL);
691 if (cmd.a_save[REGRESSION_SV_RESID])
693 reg_save_var ("RES", regression_trns_resid_proc, *lc,
694 &(*lc)->resid, n_trns);
696 if (cmd.a_save[REGRESSION_SV_PRED])
698 reg_save_var ("PRED", regression_trns_pred_proc, *lc,
699 &(*lc)->pred, n_trns);
705 for (lc = models; lc < models + cmd.n_dependent; lc++)
707 assert (*lc != NULL);
708 pspp_linreg_cache_free (*lc);
713 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
717 for (i = 0; i < n_vars; i++)
719 if (v->index == varlist[i]->index)
727 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
732 struct variable **varlist;
733 struct pspp_coeff *coeff;
734 const struct variable *v;
737 fprintf (fp, "%s", reg_export_categorical_encode_1);
739 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
740 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
743 v = pspp_coeff_get_var (coeff, 0);
744 if (v->type == ALPHA)
746 if (!reg_inserted (v, varlist, n_vars))
748 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
750 varlist[n_vars] = (struct variable *) v;
755 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
756 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
758 for (i = 0; i < n_vars - 1; i++)
760 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
762 fprintf (fp, "&%s};\n\t", varlist[i]->name);
764 for (i = 0; i < n_vars; i++)
767 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
769 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
770 varlist[i]->obs_vals->n_categories);
772 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
774 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
775 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
776 value_to_string (val, varlist[i]));
779 fprintf (fp, "%s", reg_export_categorical_encode_2);
783 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
786 struct pspp_coeff *coeff;
787 const struct variable *v;
789 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
790 for (i = 1; i < c->n_indeps; i++)
793 v = pspp_coeff_get_var (coeff, 0);
794 fprintf (fp, "\"%s\",\n\t\t", v->name);
797 v = pspp_coeff_get_var (coeff, 0);
798 fprintf (fp, "\"%s\"};\n\t", v->name);
801 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
803 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
804 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
805 reg_print_depvars (fp, c);
806 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
808 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
809 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
812 reg_has_categorical (pspp_linreg_cache * c)
815 const struct variable *v;
817 for (i = 1; i < c->n_coeffs; i++)
819 v = pspp_coeff_get_var (c->coeff[i], 0);
820 if (v->type == ALPHA)
829 subcommand_export (int export, pspp_linreg_cache * c)
834 int n_quantiles = 100;
837 struct pspp_coeff *coeff;
842 assert (model_file != NULL);
843 fp = fopen (fh_get_file_name (model_file), "w");
845 fprintf (fp, "%s", reg_preamble);
846 reg_print_getvar (fp, c);
847 if (reg_has_categorical (c))
849 reg_print_categorical_encoding (fp, c);
851 fprintf (fp, "%s", reg_export_t_quantiles_1);
852 increment = 0.5 / (double) increment;
853 for (i = 0; i < n_quantiles - 1; i++)
855 tmp = 0.5 + 0.005 * (double) i;
856 fprintf (fp, "%.15e,\n\t\t",
857 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
859 fprintf (fp, "%.15e};\n\t",
860 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
861 fprintf (fp, "%s", reg_export_t_quantiles_2);
862 fprintf (fp, "%s", reg_mean_cmt);
863 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
864 fprintf (fp, "const char *var_names[])\n{\n\t");
865 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
866 for (i = 1; i < c->n_indeps; i++)
869 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
872 fprintf (fp, "%.15e};\n\t", coeff->estimate);
874 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
875 fprintf (fp, "int i;\n\tint j;\n\n\t");
876 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
877 fprintf (fp, "%s", reg_getvar);
878 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
880 for (i = 0; i < c->cov->size1 - 1; i++)
883 for (j = 0; j < c->cov->size2 - 1; j++)
885 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
887 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
890 for (j = 0; j < c->cov->size2 - 1; j++)
892 fprintf (fp, "%.15e, ",
893 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
895 fprintf (fp, "%.15e}\n\t",
896 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
897 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
899 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
900 fprintf (fp, "%s", reg_variance);
901 fprintf (fp, "%s", reg_export_confidence_interval);
902 tmp = c->mse * c->mse;
903 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
904 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
905 fprintf (fp, "%s", reg_export_prediction_interval_3);
907 fp = fopen ("pspp_model_reg.h", "w");
908 fprintf (fp, "%s", reg_header);
913 regression_custom_export (struct cmd_regression *cmd UNUSED, void *aux UNUSED)
915 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
916 if (!lex_force_match ('('))
923 model_file = fh_parse (FH_REF_FILE);
924 if (model_file == NULL)
928 if (!lex_force_match (')'))
935 cmd_regression (void)
937 if (!parse_regression (&cmd, NULL))
940 models = xnmalloc (cmd.n_dependent, sizeof *models);
941 if (!multipass_procedure_with_splits (current_dataset, run_regression, &cmd))
942 return CMD_CASCADING_FAILURE;
943 subcommand_save (cmd.sbc_save, models);
950 Is variable k the dependent variable?
953 is_depvar (size_t k, const struct variable *v)
956 compare_var_names returns 0 if the variable
959 if (!compare_var_names (v, v_variables[k], NULL))
966 Mark missing cases. Return the number of non-missing cases.
969 mark_missing_cases (const struct casefile *cf, struct variable *v,
970 int *is_missing_case, double n_data)
972 struct casereader *r;
975 const union value *val;
977 for (r = casefile_get_reader (cf);
978 casereader_read (r, &c); case_destroy (&c))
980 row = casereader_cnum (r) - 1;
982 val = case_data (&c, v->fv);
983 cat_value_update (v, val);
984 if (mv_is_value_missing (&v->miss, val))
986 if (!is_missing_case[row])
988 /* Now it is missing. */
990 is_missing_case[row] = 1;
994 casereader_destroy (r);
999 /* Parser for the variables sub command */
1001 regression_custom_variables (struct cmd_regression *cmd UNUSED,
1007 if ((token != T_ID || dict_lookup_var (dataset_dict (current_dataset), tokid) == NULL)
1012 if (!parse_variables (dataset_dict (current_dataset), &v_variables, &n_variables, PV_NONE))
1017 assert (n_variables);
1023 Count the explanatory variables. The user may or may
1024 not have specified a response variable in the syntax.
1027 get_n_indep (const struct variable *v)
1032 result = n_variables;
1033 while (i < n_variables)
1035 if (is_depvar (i, v))
1046 Read from the active file. Identify the explanatory variables in
1047 v_variables. Encode categorical variables. Drop cases with missing
1051 prepare_data (int n_data, int is_missing_case[],
1052 struct variable **indep_vars,
1053 struct variable *depvar, const struct casefile *cf)
1058 assert (indep_vars != NULL);
1060 for (i = 0; i < n_variables; i++)
1062 if (!is_depvar (i, depvar))
1064 indep_vars[j] = v_variables[i];
1066 if (v_variables[i]->type == ALPHA)
1068 /* Make a place to hold the binary vectors
1069 corresponding to this variable's values. */
1070 cat_stored_values_create (v_variables[i]);
1073 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1077 Mark missing cases for the dependent variable.
1079 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1084 coeff_init (pspp_linreg_cache *c, struct design_matrix *dm)
1086 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1087 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1088 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1089 pspp_coeff_init (c->coeff + 1, dm);
1092 run_regression (const struct ccase *first,
1093 const struct casefile *cf, void *cmd_ UNUSED)
1096 size_t n_data = 0; /* Number of valide cases. */
1097 size_t n_cases; /* Number of cases. */
1103 Keep track of the missing cases.
1105 int *is_missing_case;
1106 const union value *val;
1107 struct casereader *r;
1109 struct variable **indep_vars;
1110 struct design_matrix *X;
1113 pspp_linreg_opts lopts;
1115 assert (models != NULL);
1117 output_split_file_values (first);
1121 dict_get_vars (dataset_dict (current_dataset), &v_variables, &n_variables,
1125 n_cases = casefile_get_case_cnt (cf);
1127 for (i = 0; i < cmd.n_dependent; i++)
1129 if (cmd.v_dependent[i]->type != NUMERIC)
1131 msg (SE, gettext ("Dependent variable must be numeric."));
1132 pspp_reg_rc = CMD_FAILURE;
1137 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1139 lopts.get_depvar_mean_std = 1;
1141 for (k = 0; k < cmd.n_dependent; k++)
1143 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1144 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1145 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1146 assert (indep_vars != NULL);
1148 for (i = 0; i < n_cases; i++)
1150 is_missing_case[i] = 0;
1152 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1154 (const struct casefile *) cf);
1155 Y = gsl_vector_alloc (n_data);
1158 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1160 for (i = 0; i < X->m->size2; i++)
1162 lopts.get_indep_mean_std[i] = 1;
1164 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1165 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1166 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1167 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1169 For large data sets, use QR decomposition.
1171 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1173 models[k]->method = PSPP_LINREG_SVD;
1177 The second pass fills the design matrix.
1180 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1182 /* Iterate over the cases. */
1184 case_num = casereader_cnum (r) - 1;
1185 if (!is_missing_case[case_num])
1187 for (i = 0; i < n_variables; ++i) /* Iterate over the
1192 val = case_data (&c, v_variables[i]->fv);
1194 Independent/dependent variable separation. The
1195 'variables' subcommand specifies a varlist which contains
1196 both dependent and independent variables. The dependent
1197 variables are specified with the 'dependent'
1198 subcommand, and maybe also in the 'variables' subcommand.
1199 We need to separate the two.
1201 if (!is_depvar (i, cmd.v_dependent[k]))
1203 if (v_variables[i]->type == ALPHA)
1205 design_matrix_set_categorical (X, row,
1206 v_variables[i], val);
1208 else if (v_variables[i]->type == NUMERIC)
1210 design_matrix_set_numeric (X, row, v_variables[i],
1215 val = case_data (&c, cmd.v_dependent[k]->fv);
1216 gsl_vector_set (Y, row, val->f);
1221 Now that we know the number of coefficients, allocate space
1222 and store pointers to the variables that correspond to the
1225 coeff_init (models[k], X);
1228 Find the least-squares estimates and other statistics.
1230 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1231 subcommand_statistics (cmd.a_statistics, models[k]);
1232 subcommand_export (cmd.sbc_export, models[k]);
1234 gsl_vector_free (Y);
1235 design_matrix_destroy (X);
1237 free (lopts.get_indep_mean_std);
1238 casereader_destroy (r);
1241 free (is_missing_case);