1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/dictionary/split-file.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <libpspp/alloc.h>
44 #include <libpspp/compiler.h>
45 #include <libpspp/message.h>
46 #include <math/design-matrix.h>
47 #include <math/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
86 /* Linear regression models. */
87 static pspp_linreg_cache **models = NULL;
90 Transformations for saving predicted values
95 int n_trns; /* Number of transformations. */
96 int trns_id; /* Which trns is this one? */
97 pspp_linreg_cache *c; /* Linear model for this trns. */
100 Variables used (both explanatory and response).
102 static struct variable **v_variables;
107 static size_t n_variables;
110 File where the model will be saved if the EXPORT subcommand
113 static struct file_handle *model_file;
116 Return value for the procedure.
118 static int pspp_reg_rc = CMD_SUCCESS;
120 static bool run_regression (const struct ccase *,
121 const struct casefile *, void *,
122 const struct dataset *);
125 STATISTICS subcommand output functions.
127 static void reg_stats_r (pspp_linreg_cache *);
128 static void reg_stats_coeff (pspp_linreg_cache *);
129 static void reg_stats_anova (pspp_linreg_cache *);
130 static void reg_stats_outs (pspp_linreg_cache *);
131 static void reg_stats_zpp (pspp_linreg_cache *);
132 static void reg_stats_label (pspp_linreg_cache *);
133 static void reg_stats_sha (pspp_linreg_cache *);
134 static void reg_stats_ci (pspp_linreg_cache *);
135 static void reg_stats_f (pspp_linreg_cache *);
136 static void reg_stats_bcov (pspp_linreg_cache *);
137 static void reg_stats_ses (pspp_linreg_cache *);
138 static void reg_stats_xtx (pspp_linreg_cache *);
139 static void reg_stats_collin (pspp_linreg_cache *);
140 static void reg_stats_tol (pspp_linreg_cache *);
141 static void reg_stats_selection (pspp_linreg_cache *);
142 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
143 int, pspp_linreg_cache *);
146 reg_stats_r (pspp_linreg_cache * c)
156 rsq = c->ssm / c->sst;
157 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
158 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
159 t = tab_create (n_cols, n_rows, 0);
160 tab_dim (t, tab_natural_dimensions);
161 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
162 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
163 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
164 tab_vline (t, TAL_0, 1, 0, 0);
166 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
167 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
168 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
169 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
170 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
171 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
172 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
173 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
174 tab_title (t, _("Model Summary"));
179 Table showing estimated regression coefficients.
182 reg_stats_coeff (pspp_linreg_cache * c)
194 const struct variable *v;
195 const union value *val;
200 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
201 n_rows = c->n_coeffs + 2;
203 t = tab_create (n_cols, n_rows, 0);
204 tab_headers (t, 2, 0, 1, 0);
205 tab_dim (t, tab_natural_dimensions);
206 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
207 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
208 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
209 tab_vline (t, TAL_0, 1, 0, 0);
211 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
212 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
213 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
214 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
215 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
216 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
217 coeff = c->coeff[0]->estimate;
218 tab_float (t, 2, 1, 0, coeff, 10, 2);
219 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
220 tab_float (t, 3, 1, 0, std_err, 10, 2);
221 beta = coeff / c->depvar_std;
222 tab_float (t, 4, 1, 0, beta, 10, 2);
223 t_stat = coeff / std_err;
224 tab_float (t, 5, 1, 0, t_stat, 10, 2);
225 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
226 tab_float (t, 6, 1, 0, pval, 10, 2);
227 for (j = 1; j <= c->n_indeps; j++)
229 v = pspp_coeff_get_var (c->coeff[j], 0);
230 label = var_to_string (v);
231 /* Do not overwrite the variable's name. */
232 strncpy (tmp, label, MAX_STRING);
233 if (v->type == ALPHA)
236 Append the value associated with this coefficient.
237 This makes sense only if we us the usual binary encoding
241 val = pspp_coeff_get_value (c->coeff[j], v);
242 val_s = value_to_string (val, v);
243 strncat (tmp, val_s, MAX_STRING);
246 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
248 Regression coefficients.
250 coeff = c->coeff[j]->estimate;
251 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
253 Standard error of the coefficients.
255 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
256 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
258 'Standardized' coefficient, i.e., regression coefficient
259 if all variables had unit variance.
261 beta = gsl_vector_get (c->indep_std, j);
262 beta *= coeff / c->depvar_std;
263 tab_float (t, 4, j + 1, 0, beta, 10, 2);
266 Test statistic for H0: coefficient is 0.
268 t_stat = coeff / std_err;
269 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
271 P values for the test statistic above.
273 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
274 tab_float (t, 6, j + 1, 0, pval, 10, 2);
276 tab_title (t, _("Coefficients"));
282 Display the ANOVA table.
285 reg_stats_anova (pspp_linreg_cache * c)
289 const double msm = c->ssm / c->dfm;
290 const double mse = c->sse / c->dfe;
291 const double F = msm / mse;
292 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
297 t = tab_create (n_cols, n_rows, 0);
298 tab_headers (t, 2, 0, 1, 0);
299 tab_dim (t, tab_natural_dimensions);
301 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
303 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
304 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
305 tab_vline (t, TAL_0, 1, 0, 0);
307 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
308 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
309 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
310 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
311 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
313 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
314 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
315 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
317 /* Sums of Squares */
318 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
319 tab_float (t, 2, 3, 0, c->sst, 10, 2);
320 tab_float (t, 2, 2, 0, c->sse, 10, 2);
323 /* Degrees of freedom */
324 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
325 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
326 tab_float (t, 3, 3, 0, c->dft, 4, 0);
330 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
331 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
333 tab_float (t, 5, 1, 0, F, 8, 3);
335 tab_float (t, 6, 1, 0, pval, 8, 3);
337 tab_title (t, _("ANOVA"));
341 reg_stats_outs (pspp_linreg_cache * c)
346 reg_stats_zpp (pspp_linreg_cache * c)
351 reg_stats_label (pspp_linreg_cache * c)
356 reg_stats_sha (pspp_linreg_cache * c)
361 reg_stats_ci (pspp_linreg_cache * c)
366 reg_stats_f (pspp_linreg_cache * c)
371 reg_stats_bcov (pspp_linreg_cache * c)
383 n_cols = c->n_indeps + 1 + 2;
384 n_rows = 2 * (c->n_indeps + 1);
385 t = tab_create (n_cols, n_rows, 0);
386 tab_headers (t, 2, 0, 1, 0);
387 tab_dim (t, tab_natural_dimensions);
388 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
389 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
390 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
391 tab_vline (t, TAL_0, 1, 0, 0);
392 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
393 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
394 for (i = 1; i < c->n_coeffs; i++)
396 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
397 label = var_to_string (v);
398 tab_text (t, 2, i, TAB_CENTER, label);
399 tab_text (t, i + 2, 0, TAB_CENTER, label);
400 for (k = 1; k < c->n_coeffs; k++)
402 col = (i <= k) ? k : i;
403 row = (i <= k) ? i : k;
404 tab_float (t, k + 2, i, TAB_CENTER,
405 gsl_matrix_get (c->cov, row, col), 8, 3);
408 tab_title (t, _("Coefficient Correlations"));
412 reg_stats_ses (pspp_linreg_cache * c)
417 reg_stats_xtx (pspp_linreg_cache * c)
422 reg_stats_collin (pspp_linreg_cache * c)
427 reg_stats_tol (pspp_linreg_cache * c)
432 reg_stats_selection (pspp_linreg_cache * c)
438 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
439 int keyword, pspp_linreg_cache * c)
448 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
451 The order here must match the order in which the STATISTICS
452 keywords appear in the specification section above.
479 Set everything but F.
481 for (i = 0; i < f; i++)
488 for (i = 0; i < all; i++)
496 Default output: ANOVA table, parameter estimates,
497 and statistics for variables not entered into model,
500 if (keywords[defaults] | d)
508 statistics_keyword_output (reg_stats_r, keywords[r], c);
509 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
510 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
511 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
512 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
513 statistics_keyword_output (reg_stats_label, keywords[label], c);
514 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
515 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
516 statistics_keyword_output (reg_stats_f, keywords[f], c);
517 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
518 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
519 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
520 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
521 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
522 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
526 Free the transformation. Free its linear model if this
527 transformation is the last one.
530 regression_trns_free (void *t_)
533 struct reg_trns *t = t_;
535 if (t->trns_id == t->n_trns)
537 result = pspp_linreg_cache_free (t->c);
545 Gets the predicted values.
548 regression_trns_pred_proc (void *t_, struct ccase *c,
549 casenumber case_idx UNUSED)
553 struct reg_trns *trns = t_;
554 pspp_linreg_cache *model;
555 union value *output = NULL;
556 const union value **vals = NULL;
557 struct variable **vars = NULL;
559 assert (trns != NULL);
561 assert (model != NULL);
562 assert (model->depvar != NULL);
563 assert (model->pred != NULL);
565 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
566 n_vals = (*model->get_vars) (model, vars);
568 vals = xnmalloc (n_vals, sizeof (*vals));
569 output = case_data_rw (c, model->pred->fv);
570 assert (output != NULL);
572 for (i = 0; i < n_vals; i++)
574 vals[i] = case_data (c, vars[i]->fv);
576 output->f = (*model->predict) ((const struct variable **) vars,
577 vals, model, n_vals);
580 return TRNS_CONTINUE;
587 regression_trns_resid_proc (void *t_, struct ccase *c,
588 casenumber case_idx UNUSED)
592 struct reg_trns *trns = t_;
593 pspp_linreg_cache *model;
594 union value *output = NULL;
595 const union value **vals = NULL;
596 const union value *obs = NULL;
597 struct variable **vars = NULL;
599 assert (trns != NULL);
601 assert (model != NULL);
602 assert (model->depvar != NULL);
603 assert (model->resid != NULL);
605 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
606 n_vals = (*model->get_vars) (model, vars);
608 vals = xnmalloc (n_vals, sizeof (*vals));
609 output = case_data_rw (c, model->resid->fv);
610 assert (output != NULL);
612 for (i = 0; i < n_vals; i++)
614 vals[i] = case_data (c, vars[i]->fv);
616 obs = case_data (c, model->depvar->fv);
617 output->f = (*model->residual) ((const struct variable **) vars,
618 vals, obs, model, n_vals);
621 return TRNS_CONTINUE;
625 Returns false if NAME is a duplicate of any existing variable name.
628 try_name (const struct dictionary *dict, const char *name)
630 if (dict_lookup_var (dict, name) != NULL)
637 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
641 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
642 while (!try_name (dict, name))
645 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
650 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
651 pspp_linreg_cache * c, struct variable **v, int n_trns)
653 struct dictionary *dict = dataset_dict (ds);
654 static int trns_index = 1;
655 char name[LONG_NAME_LEN];
656 struct variable *new_var;
657 struct reg_trns *t = NULL;
659 t = xmalloc (sizeof (*t));
660 t->trns_id = trns_index;
663 reg_get_name (dict, name, prefix);
664 new_var = dict_create_var (dict, name, 0);
665 assert (new_var != NULL);
667 add_transformation (ds, f, regression_trns_free, t);
672 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
674 pspp_linreg_cache **lc;
678 assert (models != NULL);
682 /* Count the number of transformations we will need. */
683 for (i = 0; i < REGRESSION_SV_count; i++)
690 n_trns *= cmd.n_dependent;
692 for (lc = models; lc < models + cmd.n_dependent; lc++)
694 assert (*lc != NULL);
695 assert ((*lc)->depvar != NULL);
696 if (cmd.a_save[REGRESSION_SV_RESID])
698 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
699 &(*lc)->resid, n_trns);
701 if (cmd.a_save[REGRESSION_SV_PRED])
703 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
704 &(*lc)->pred, n_trns);
710 for (lc = models; lc < models + cmd.n_dependent; lc++)
712 assert (*lc != NULL);
713 pspp_linreg_cache_free (*lc);
719 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
723 for (i = 0; i < n_vars; i++)
725 if (v->index == varlist[i]->index)
734 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
739 struct variable **varlist;
740 struct pspp_coeff *coeff;
741 const struct variable *v;
744 fprintf (fp, "%s", reg_export_categorical_encode_1);
746 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
747 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
750 v = pspp_coeff_get_var (coeff, 0);
751 if (v->type == ALPHA)
753 if (!reg_inserted (v, varlist, n_vars))
755 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
757 varlist[n_vars] = (struct variable *) v;
762 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
763 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
765 for (i = 0; i < n_vars - 1; i++)
767 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
769 fprintf (fp, "&%s};\n\t", varlist[i]->name);
771 for (i = 0; i < n_vars; i++)
774 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
776 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
777 varlist[i]->obs_vals->n_categories);
779 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
781 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
782 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
783 value_to_string (val, varlist[i]));
786 fprintf (fp, "%s", reg_export_categorical_encode_2);
790 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
793 struct pspp_coeff *coeff;
794 const struct variable *v;
796 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
797 for (i = 1; i < c->n_indeps; i++)
800 v = pspp_coeff_get_var (coeff, 0);
801 fprintf (fp, "\"%s\",\n\t\t", v->name);
804 v = pspp_coeff_get_var (coeff, 0);
805 fprintf (fp, "\"%s\"};\n\t", v->name);
808 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
810 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
811 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
812 reg_print_depvars (fp, c);
813 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
815 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
816 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
819 reg_has_categorical (pspp_linreg_cache * c)
822 const struct variable *v;
824 for (i = 1; i < c->n_coeffs; i++)
826 v = pspp_coeff_get_var (c->coeff[i], 0);
827 if (v->type == ALPHA)
836 subcommand_export (int export, pspp_linreg_cache * c)
841 int n_quantiles = 100;
843 struct pspp_coeff *coeff;
848 assert (model_file != NULL);
849 fp = fopen (fh_get_file_name (model_file), "w");
851 fprintf (fp, "%s", reg_preamble);
852 reg_print_getvar (fp, c);
853 if (reg_has_categorical (c))
855 reg_print_categorical_encoding (fp, c);
857 fprintf (fp, "%s", reg_export_t_quantiles_1);
858 for (i = 0; i < n_quantiles - 1; i++)
860 tmp = 0.5 + 0.005 * (double) i;
861 fprintf (fp, "%.15e,\n\t\t",
862 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
864 fprintf (fp, "%.15e};\n\t",
865 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
866 fprintf (fp, "%s", reg_export_t_quantiles_2);
867 fprintf (fp, "%s", reg_mean_cmt);
868 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
869 fprintf (fp, "const char *var_names[])\n{\n\t");
870 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
871 for (i = 1; i < c->n_indeps; i++)
874 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
877 fprintf (fp, "%.15e};\n\t", coeff->estimate);
879 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
880 fprintf (fp, "int i;\n\tint j;\n\n\t");
881 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
882 fprintf (fp, "%s", reg_getvar);
883 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
885 for (i = 0; i < c->cov->size1 - 1; i++)
888 for (j = 0; j < c->cov->size2 - 1; j++)
890 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
892 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
895 for (j = 0; j < c->cov->size2 - 1; j++)
897 fprintf (fp, "%.15e, ",
898 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
900 fprintf (fp, "%.15e}\n\t",
901 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
902 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
904 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
905 fprintf (fp, "%s", reg_variance);
906 fprintf (fp, "%s", reg_export_confidence_interval);
907 tmp = c->mse * c->mse;
908 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
909 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
910 fprintf (fp, "%s", reg_export_prediction_interval_3);
912 fp = fopen ("pspp_model_reg.h", "w");
913 fprintf (fp, "%s", reg_header);
919 regression_custom_export (struct dataset *ds UNUSED, struct cmd_regression *cmd UNUSED, void *aux UNUSED)
921 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
922 if (!lex_force_match ('('))
929 model_file = fh_parse (FH_REF_FILE);
930 if (model_file == NULL)
934 if (!lex_force_match (')'))
941 cmd_regression (struct dataset *ds)
943 if (!parse_regression (ds, &cmd, NULL))
946 models = xnmalloc (cmd.n_dependent, sizeof *models);
947 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
948 return CMD_CASCADING_FAILURE;
949 subcommand_save (ds, cmd.sbc_save, models);
956 Is variable k the dependent variable?
959 is_depvar (size_t k, const struct variable *v)
962 compare_var_names returns 0 if the variable
965 if (!compare_var_names (v, v_variables[k], NULL))
972 Mark missing cases. Return the number of non-missing cases.
975 mark_missing_cases (const struct casefile *cf, struct variable *v,
976 int *is_missing_case, double n_data)
978 struct casereader *r;
981 const union value *val;
983 for (r = casefile_get_reader (cf);
984 casereader_read (r, &c); case_destroy (&c))
986 row = casereader_cnum (r) - 1;
988 val = case_data (&c, v->fv);
989 cat_value_update (v, val);
990 if (mv_is_value_missing (&v->miss, val))
992 if (!is_missing_case[row])
994 /* Now it is missing. */
996 is_missing_case[row] = 1;
1000 casereader_destroy (r);
1005 /* Parser for the variables sub command */
1007 regression_custom_variables (struct dataset *ds,
1008 struct cmd_regression *cmd UNUSED,
1011 const struct dictionary *dict = dataset_dict (ds);
1015 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
1020 if (!parse_variables (dict, &v_variables, &n_variables, PV_NONE))
1025 assert (n_variables);
1031 Count the explanatory variables. The user may or may
1032 not have specified a response variable in the syntax.
1035 get_n_indep (const struct variable *v)
1040 result = n_variables;
1041 while (i < n_variables)
1043 if (is_depvar (i, v))
1054 Read from the active file. Identify the explanatory variables in
1055 v_variables. Encode categorical variables. Drop cases with missing
1059 prepare_data (int n_data, int is_missing_case[],
1060 struct variable **indep_vars,
1061 struct variable *depvar, const struct casefile *cf)
1066 assert (indep_vars != NULL);
1068 for (i = 0; i < n_variables; i++)
1070 if (!is_depvar (i, depvar))
1072 indep_vars[j] = v_variables[i];
1074 if (v_variables[i]->type == ALPHA)
1076 /* Make a place to hold the binary vectors
1077 corresponding to this variable's values. */
1078 cat_stored_values_create (v_variables[i]);
1081 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1085 Mark missing cases for the dependent variable.
1087 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1092 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1094 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1095 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1096 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1097 pspp_coeff_init (c->coeff + 1, dm);
1101 run_regression (const struct ccase *first,
1102 const struct casefile *cf, void *cmd_ UNUSED, const struct dataset *ds)
1105 size_t n_data = 0; /* Number of valide cases. */
1106 size_t n_cases; /* Number of cases. */
1112 Keep track of the missing cases.
1114 int *is_missing_case;
1115 const union value *val;
1116 struct casereader *r;
1118 struct variable **indep_vars;
1119 struct design_matrix *X;
1122 pspp_linreg_opts lopts;
1124 assert (models != NULL);
1126 output_split_file_values (ds, first);
1130 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1134 n_cases = casefile_get_case_cnt (cf);
1136 for (i = 0; i < cmd.n_dependent; i++)
1138 if (cmd.v_dependent[i]->type != NUMERIC)
1140 msg (SE, gettext ("Dependent variable must be numeric."));
1141 pspp_reg_rc = CMD_FAILURE;
1146 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1148 lopts.get_depvar_mean_std = 1;
1150 for (k = 0; k < cmd.n_dependent; k++)
1152 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1153 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1154 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1155 assert (indep_vars != NULL);
1157 for (i = 0; i < n_cases; i++)
1159 is_missing_case[i] = 0;
1161 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1163 (const struct casefile *) cf);
1164 Y = gsl_vector_alloc (n_data);
1167 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1169 for (i = 0; i < X->m->size2; i++)
1171 lopts.get_indep_mean_std[i] = 1;
1173 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1174 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1175 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1176 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1178 For large data sets, use QR decomposition.
1180 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1182 models[k]->method = PSPP_LINREG_SVD;
1186 The second pass fills the design matrix.
1189 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1191 /* Iterate over the cases. */
1193 case_num = casereader_cnum (r) - 1;
1194 if (!is_missing_case[case_num])
1196 for (i = 0; i < n_variables; ++i) /* Iterate over the
1201 val = case_data (&c, v_variables[i]->fv);
1203 Independent/dependent variable separation. The
1204 'variables' subcommand specifies a varlist which contains
1205 both dependent and independent variables. The dependent
1206 variables are specified with the 'dependent'
1207 subcommand, and maybe also in the 'variables' subcommand.
1208 We need to separate the two.
1210 if (!is_depvar (i, cmd.v_dependent[k]))
1212 if (v_variables[i]->type == ALPHA)
1214 design_matrix_set_categorical (X, row,
1215 v_variables[i], val);
1217 else if (v_variables[i]->type == NUMERIC)
1219 design_matrix_set_numeric (X, row, v_variables[i],
1224 val = case_data (&c, cmd.v_dependent[k]->fv);
1225 gsl_vector_set (Y, row, val->f);
1230 Now that we know the number of coefficients, allocate space
1231 and store pointers to the variables that correspond to the
1234 coeff_init (models[k], X);
1237 Find the least-squares estimates and other statistics.
1239 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1240 subcommand_statistics (cmd.a_statistics, models[k]);
1241 subcommand_export (cmd.sbc_export, models[k]);
1243 gsl_vector_free (Y);
1244 design_matrix_destroy (X);
1246 free (lopts.get_indep_mean_std);
1247 casereader_destroy (r);
1250 free (is_missing_case);