1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/dictionary/split-file.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <libpspp/alloc.h>
44 #include <libpspp/compiler.h>
45 #include <libpspp/message.h>
46 #include <math/design-matrix.h>
47 #include <math/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
86 /* Linear regression models. */
87 static pspp_linreg_cache **models = NULL;
90 Transformations for saving predicted values
95 int n_trns; /* Number of transformations. */
96 int trns_id; /* Which trns is this one? */
97 pspp_linreg_cache *c; /* Linear model for this trns. */
100 Variables used (both explanatory and response).
102 static struct variable **v_variables;
107 static size_t n_variables;
110 File where the model will be saved if the EXPORT subcommand
113 static struct file_handle *model_file;
116 Return value for the procedure.
118 static int pspp_reg_rc = CMD_SUCCESS;
120 static bool run_regression (const struct ccase *,
121 const struct casefile *, void *,
122 const struct dataset *);
125 STATISTICS subcommand output functions.
127 static void reg_stats_r (pspp_linreg_cache *);
128 static void reg_stats_coeff (pspp_linreg_cache *);
129 static void reg_stats_anova (pspp_linreg_cache *);
130 static void reg_stats_outs (pspp_linreg_cache *);
131 static void reg_stats_zpp (pspp_linreg_cache *);
132 static void reg_stats_label (pspp_linreg_cache *);
133 static void reg_stats_sha (pspp_linreg_cache *);
134 static void reg_stats_ci (pspp_linreg_cache *);
135 static void reg_stats_f (pspp_linreg_cache *);
136 static void reg_stats_bcov (pspp_linreg_cache *);
137 static void reg_stats_ses (pspp_linreg_cache *);
138 static void reg_stats_xtx (pspp_linreg_cache *);
139 static void reg_stats_collin (pspp_linreg_cache *);
140 static void reg_stats_tol (pspp_linreg_cache *);
141 static void reg_stats_selection (pspp_linreg_cache *);
142 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
143 int, pspp_linreg_cache *);
146 reg_stats_r (pspp_linreg_cache * c)
156 rsq = c->ssm / c->sst;
157 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
158 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
159 t = tab_create (n_cols, n_rows, 0);
160 tab_dim (t, tab_natural_dimensions);
161 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
162 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
163 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
164 tab_vline (t, TAL_0, 1, 0, 0);
166 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
167 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
168 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
169 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
170 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
171 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
172 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
173 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
174 tab_title (t, _("Model Summary"));
179 Table showing estimated regression coefficients.
182 reg_stats_coeff (pspp_linreg_cache * c)
194 const struct variable *v;
195 const union value *val;
200 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
201 n_rows = c->n_coeffs + 2;
203 t = tab_create (n_cols, n_rows, 0);
204 tab_headers (t, 2, 0, 1, 0);
205 tab_dim (t, tab_natural_dimensions);
206 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
207 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
208 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
209 tab_vline (t, TAL_0, 1, 0, 0);
211 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
212 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
213 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
214 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
215 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
216 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
217 coeff = c->coeff[0]->estimate;
218 tab_float (t, 2, 1, 0, coeff, 10, 2);
219 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
220 tab_float (t, 3, 1, 0, std_err, 10, 2);
221 beta = coeff / c->depvar_std;
222 tab_float (t, 4, 1, 0, beta, 10, 2);
223 t_stat = coeff / std_err;
224 tab_float (t, 5, 1, 0, t_stat, 10, 2);
225 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
226 tab_float (t, 6, 1, 0, pval, 10, 2);
227 for (j = 1; j <= c->n_indeps; j++)
229 v = pspp_coeff_get_var (c->coeff[j], 0);
230 label = var_to_string (v);
231 /* Do not overwrite the variable's name. */
232 strncpy (tmp, label, MAX_STRING);
233 if (var_is_alpha (v))
236 Append the value associated with this coefficient.
237 This makes sense only if we us the usual binary encoding
241 val = pspp_coeff_get_value (c->coeff[j], v);
242 val_s = var_get_value_name (v, val);
243 strncat (tmp, val_s, MAX_STRING);
246 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
248 Regression coefficients.
250 coeff = c->coeff[j]->estimate;
251 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
253 Standard error of the coefficients.
255 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
256 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
258 'Standardized' coefficient, i.e., regression coefficient
259 if all variables had unit variance.
261 beta = gsl_vector_get (c->indep_std, j);
262 beta *= coeff / c->depvar_std;
263 tab_float (t, 4, j + 1, 0, beta, 10, 2);
266 Test statistic for H0: coefficient is 0.
268 t_stat = coeff / std_err;
269 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
271 P values for the test statistic above.
273 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
274 tab_float (t, 6, j + 1, 0, pval, 10, 2);
276 tab_title (t, _("Coefficients"));
282 Display the ANOVA table.
285 reg_stats_anova (pspp_linreg_cache * c)
289 const double msm = c->ssm / c->dfm;
290 const double mse = c->sse / c->dfe;
291 const double F = msm / mse;
292 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
297 t = tab_create (n_cols, n_rows, 0);
298 tab_headers (t, 2, 0, 1, 0);
299 tab_dim (t, tab_natural_dimensions);
301 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
303 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
304 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
305 tab_vline (t, TAL_0, 1, 0, 0);
307 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
308 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
309 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
310 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
311 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
313 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
314 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
315 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
317 /* Sums of Squares */
318 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
319 tab_float (t, 2, 3, 0, c->sst, 10, 2);
320 tab_float (t, 2, 2, 0, c->sse, 10, 2);
323 /* Degrees of freedom */
324 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
325 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
326 tab_float (t, 3, 3, 0, c->dft, 4, 0);
330 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
331 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
333 tab_float (t, 5, 1, 0, F, 8, 3);
335 tab_float (t, 6, 1, 0, pval, 8, 3);
337 tab_title (t, _("ANOVA"));
341 reg_stats_outs (pspp_linreg_cache * c)
346 reg_stats_zpp (pspp_linreg_cache * c)
351 reg_stats_label (pspp_linreg_cache * c)
356 reg_stats_sha (pspp_linreg_cache * c)
361 reg_stats_ci (pspp_linreg_cache * c)
366 reg_stats_f (pspp_linreg_cache * c)
371 reg_stats_bcov (pspp_linreg_cache * c)
383 n_cols = c->n_indeps + 1 + 2;
384 n_rows = 2 * (c->n_indeps + 1);
385 t = tab_create (n_cols, n_rows, 0);
386 tab_headers (t, 2, 0, 1, 0);
387 tab_dim (t, tab_natural_dimensions);
388 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
389 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
390 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
391 tab_vline (t, TAL_0, 1, 0, 0);
392 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
393 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
394 for (i = 1; i < c->n_coeffs; i++)
396 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
397 label = var_to_string (v);
398 tab_text (t, 2, i, TAB_CENTER, label);
399 tab_text (t, i + 2, 0, TAB_CENTER, label);
400 for (k = 1; k < c->n_coeffs; k++)
402 col = (i <= k) ? k : i;
403 row = (i <= k) ? i : k;
404 tab_float (t, k + 2, i, TAB_CENTER,
405 gsl_matrix_get (c->cov, row, col), 8, 3);
408 tab_title (t, _("Coefficient Correlations"));
412 reg_stats_ses (pspp_linreg_cache * c)
417 reg_stats_xtx (pspp_linreg_cache * c)
422 reg_stats_collin (pspp_linreg_cache * c)
427 reg_stats_tol (pspp_linreg_cache * c)
432 reg_stats_selection (pspp_linreg_cache * c)
438 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
439 int keyword, pspp_linreg_cache * c)
448 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
451 The order here must match the order in which the STATISTICS
452 keywords appear in the specification section above.
479 Set everything but F.
481 for (i = 0; i < f; i++)
488 for (i = 0; i < all; i++)
496 Default output: ANOVA table, parameter estimates,
497 and statistics for variables not entered into model,
500 if (keywords[defaults] | d)
508 statistics_keyword_output (reg_stats_r, keywords[r], c);
509 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
510 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
511 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
512 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
513 statistics_keyword_output (reg_stats_label, keywords[label], c);
514 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
515 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
516 statistics_keyword_output (reg_stats_f, keywords[f], c);
517 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
518 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
519 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
520 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
521 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
522 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
526 Free the transformation. Free its linear model if this
527 transformation is the last one.
530 regression_trns_free (void *t_)
533 struct reg_trns *t = t_;
535 if (t->trns_id == t->n_trns)
537 result = pspp_linreg_cache_free (t->c);
545 Gets the predicted values.
548 regression_trns_pred_proc (void *t_, struct ccase *c,
549 casenumber case_idx UNUSED)
553 struct reg_trns *trns = t_;
554 pspp_linreg_cache *model;
555 union value *output = NULL;
556 const union value **vals = NULL;
557 struct variable **vars = NULL;
559 assert (trns != NULL);
561 assert (model != NULL);
562 assert (model->depvar != NULL);
563 assert (model->pred != NULL);
565 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
566 n_vals = (*model->get_vars) (model, vars);
568 vals = xnmalloc (n_vals, sizeof (*vals));
569 output = case_data_rw (c, model->pred);
570 assert (output != NULL);
572 for (i = 0; i < n_vals; i++)
574 vals[i] = case_data (c, vars[i]);
576 output->f = (*model->predict) ((const struct variable **) vars,
577 vals, model, n_vals);
580 return TRNS_CONTINUE;
587 regression_trns_resid_proc (void *t_, struct ccase *c,
588 casenumber case_idx UNUSED)
592 struct reg_trns *trns = t_;
593 pspp_linreg_cache *model;
594 union value *output = NULL;
595 const union value **vals = NULL;
596 const union value *obs = NULL;
597 struct variable **vars = NULL;
599 assert (trns != NULL);
601 assert (model != NULL);
602 assert (model->depvar != NULL);
603 assert (model->resid != NULL);
605 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
606 n_vals = (*model->get_vars) (model, vars);
608 vals = xnmalloc (n_vals, sizeof (*vals));
609 output = case_data_rw (c, model->resid);
610 assert (output != NULL);
612 for (i = 0; i < n_vals; i++)
614 vals[i] = case_data (c, vars[i]);
616 obs = case_data (c, model->depvar);
617 output->f = (*model->residual) ((const struct variable **) vars,
618 vals, obs, model, n_vals);
621 return TRNS_CONTINUE;
625 Returns false if NAME is a duplicate of any existing variable name.
628 try_name (const struct dictionary *dict, const char *name)
630 if (dict_lookup_var (dict, name) != NULL)
637 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
641 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
642 while (!try_name (dict, name))
645 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
650 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
651 pspp_linreg_cache * c, struct variable **v, int n_trns)
653 struct dictionary *dict = dataset_dict (ds);
654 static int trns_index = 1;
655 char name[LONG_NAME_LEN];
656 struct variable *new_var;
657 struct reg_trns *t = NULL;
659 t = xmalloc (sizeof (*t));
660 t->trns_id = trns_index;
663 reg_get_name (dict, name, prefix);
664 new_var = dict_create_var (dict, name, 0);
665 assert (new_var != NULL);
667 add_transformation (ds, f, regression_trns_free, t);
672 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
674 pspp_linreg_cache **lc;
678 assert (models != NULL);
682 /* Count the number of transformations we will need. */
683 for (i = 0; i < REGRESSION_SV_count; i++)
690 n_trns *= cmd.n_dependent;
692 for (lc = models; lc < models + cmd.n_dependent; lc++)
694 assert (*lc != NULL);
695 assert ((*lc)->depvar != NULL);
696 if (cmd.a_save[REGRESSION_SV_RESID])
698 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
699 &(*lc)->resid, n_trns);
701 if (cmd.a_save[REGRESSION_SV_PRED])
703 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
704 &(*lc)->pred, n_trns);
710 for (lc = models; lc < models + cmd.n_dependent; lc++)
712 assert (*lc != NULL);
713 pspp_linreg_cache_free (*lc);
719 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
723 for (i = 0; i < n_vars; i++)
734 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
738 struct variable **varlist;
740 fprintf (fp, "%s", reg_export_categorical_encode_1);
742 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
743 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
745 struct pspp_coeff *coeff = c->coeff[i];
746 const struct variable *v = pspp_coeff_get_var (coeff, 0);
747 if (var_is_alpha (v))
749 if (!reg_inserted (v, varlist, n_vars))
751 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
753 varlist[n_vars] = (struct variable *) v;
758 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
759 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
761 for (i = 0; i < n_vars - 1; i++)
763 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
765 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
767 for (i = 0; i < n_vars; i++)
769 size_t n_categories = cat_get_n_categories (varlist[i]);
772 fprintf (fp, "%s.name = \"%s\";\n\t",
773 var_get_name (varlist[i]),
774 var_get_name (varlist[i]));
775 fprintf (fp, "%s.n_vals = %d;\n\t",
776 var_get_name (varlist[i]),
779 for (j = 0; j < n_categories; j++)
781 union value *val = cat_subscript_to_value (j, varlist[i]);
782 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
783 var_get_name (varlist[i]), j,
784 var_get_value_name (varlist[i], val));
787 fprintf (fp, "%s", reg_export_categorical_encode_2);
791 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
794 struct pspp_coeff *coeff;
795 const struct variable *v;
797 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
798 for (i = 1; i < c->n_indeps; i++)
801 v = pspp_coeff_get_var (coeff, 0);
802 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
805 v = pspp_coeff_get_var (coeff, 0);
806 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
809 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
811 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
812 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
813 reg_print_depvars (fp, c);
814 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
816 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
817 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
820 reg_has_categorical (pspp_linreg_cache * c)
823 const struct variable *v;
825 for (i = 1; i < c->n_coeffs; i++)
827 v = pspp_coeff_get_var (c->coeff[i], 0);
828 if (var_is_alpha (v))
835 subcommand_export (int export, pspp_linreg_cache * c)
840 int n_quantiles = 100;
842 struct pspp_coeff *coeff;
847 assert (model_file != NULL);
848 fp = fopen (fh_get_file_name (model_file), "w");
850 fprintf (fp, "%s", reg_preamble);
851 reg_print_getvar (fp, c);
852 if (reg_has_categorical (c))
854 reg_print_categorical_encoding (fp, c);
856 fprintf (fp, "%s", reg_export_t_quantiles_1);
857 for (i = 0; i < n_quantiles - 1; i++)
859 tmp = 0.5 + 0.005 * (double) i;
860 fprintf (fp, "%.15e,\n\t\t",
861 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
863 fprintf (fp, "%.15e};\n\t",
864 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
865 fprintf (fp, "%s", reg_export_t_quantiles_2);
866 fprintf (fp, "%s", reg_mean_cmt);
867 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
868 fprintf (fp, "const char *var_names[])\n{\n\t");
869 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
870 for (i = 1; i < c->n_indeps; i++)
873 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
876 fprintf (fp, "%.15e};\n\t", coeff->estimate);
878 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
879 fprintf (fp, "int i;\n\tint j;\n\n\t");
880 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
881 fprintf (fp, "%s", reg_getvar);
882 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
884 for (i = 0; i < c->cov->size1 - 1; i++)
887 for (j = 0; j < c->cov->size2 - 1; j++)
889 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
891 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
894 for (j = 0; j < c->cov->size2 - 1; j++)
896 fprintf (fp, "%.15e, ",
897 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
899 fprintf (fp, "%.15e}\n\t",
900 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
901 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
903 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
904 fprintf (fp, "%s", reg_variance);
905 fprintf (fp, "%s", reg_export_confidence_interval);
906 tmp = c->mse * c->mse;
907 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
908 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
909 fprintf (fp, "%s", reg_export_prediction_interval_3);
911 fp = fopen ("pspp_model_reg.h", "w");
912 fprintf (fp, "%s", reg_header);
918 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED, struct cmd_regression *cmd UNUSED, void *aux UNUSED)
920 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
921 if (!lex_force_match (lexer, '('))
924 if (lex_match (lexer, '*'))
928 model_file = fh_parse (lexer, FH_REF_FILE);
929 if (model_file == NULL)
933 if (!lex_force_match (lexer, ')'))
940 cmd_regression (struct lexer *lexer, struct dataset *ds)
942 if (!parse_regression (lexer, ds, &cmd, NULL))
945 models = xnmalloc (cmd.n_dependent, sizeof *models);
946 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
947 return CMD_CASCADING_FAILURE;
948 subcommand_save (ds, cmd.sbc_save, models);
955 Is variable k the dependent variable?
958 is_depvar (size_t k, const struct variable *v)
960 return v == v_variables[k];
964 Mark missing cases. Return the number of non-missing cases.
967 mark_missing_cases (const struct casefile *cf, struct variable *v,
968 int *is_missing_case, double n_data)
970 struct casereader *r;
973 const union value *val;
975 for (r = casefile_get_reader (cf, NULL);
976 casereader_read (r, &c); case_destroy (&c))
978 row = casereader_cnum (r) - 1;
980 val = case_data (&c, v);
981 cat_value_update (v, val);
982 if (var_is_value_missing (v, val))
984 if (!is_missing_case[row])
986 /* Now it is missing. */
988 is_missing_case[row] = 1;
992 casereader_destroy (r);
997 /* Parser for the variables sub command */
999 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
1000 struct cmd_regression *cmd UNUSED,
1003 const struct dictionary *dict = dataset_dict (ds);
1005 lex_match (lexer, '=');
1007 if ((lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1008 && lex_token (lexer) != T_ALL)
1012 if (!parse_variables (lexer, dict, &v_variables, &n_variables, PV_NONE))
1017 assert (n_variables);
1023 Count the explanatory variables. The user may or may
1024 not have specified a response variable in the syntax.
1027 get_n_indep (const struct variable *v)
1032 result = n_variables;
1033 while (i < n_variables)
1035 if (is_depvar (i, v))
1046 Read from the active file. Identify the explanatory variables in
1047 v_variables. Encode categorical variables. Drop cases with missing
1051 prepare_data (int n_data, int is_missing_case[],
1052 struct variable **indep_vars,
1053 struct variable *depvar, const struct casefile *cf)
1058 assert (indep_vars != NULL);
1060 for (i = 0; i < n_variables; i++)
1062 if (!is_depvar (i, depvar))
1064 indep_vars[j] = v_variables[i];
1066 if (var_is_alpha (v_variables[i]))
1068 /* Make a place to hold the binary vectors
1069 corresponding to this variable's values. */
1070 cat_stored_values_create (v_variables[i]);
1073 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1077 Mark missing cases for the dependent variable.
1079 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1084 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1086 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1087 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1088 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1089 pspp_coeff_init (c->coeff + 1, dm);
1093 run_regression (const struct ccase *first,
1094 const struct casefile *cf, void *cmd_ UNUSED, const struct dataset *ds)
1097 size_t n_data = 0; /* Number of valide cases. */
1098 size_t n_cases; /* Number of cases. */
1104 Keep track of the missing cases.
1106 int *is_missing_case;
1107 const union value *val;
1108 struct casereader *r;
1110 struct variable **indep_vars;
1111 struct design_matrix *X;
1114 pspp_linreg_opts lopts;
1116 assert (models != NULL);
1118 output_split_file_values (ds, first);
1122 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1126 n_cases = casefile_get_case_cnt (cf);
1128 for (i = 0; i < cmd.n_dependent; i++)
1130 if (!var_is_numeric (cmd.v_dependent[i]))
1132 msg (SE, gettext ("Dependent variable must be numeric."));
1133 pspp_reg_rc = CMD_FAILURE;
1138 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1140 lopts.get_depvar_mean_std = 1;
1142 for (k = 0; k < cmd.n_dependent; k++)
1144 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1145 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1146 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1147 assert (indep_vars != NULL);
1149 for (i = 0; i < n_cases; i++)
1151 is_missing_case[i] = 0;
1153 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1155 (const struct casefile *) cf);
1156 Y = gsl_vector_alloc (n_data);
1159 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1161 for (i = 0; i < X->m->size2; i++)
1163 lopts.get_indep_mean_std[i] = 1;
1165 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1166 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1167 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1168 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1170 For large data sets, use QR decomposition.
1172 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1174 models[k]->method = PSPP_LINREG_SVD;
1178 The second pass fills the design matrix.
1181 for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c);
1183 /* Iterate over the cases. */
1185 case_num = casereader_cnum (r) - 1;
1186 if (!is_missing_case[case_num])
1188 for (i = 0; i < n_variables; ++i) /* Iterate over the
1193 val = case_data (&c, v_variables[i]);
1195 Independent/dependent variable separation. The
1196 'variables' subcommand specifies a varlist which contains
1197 both dependent and independent variables. The dependent
1198 variables are specified with the 'dependent'
1199 subcommand, and maybe also in the 'variables' subcommand.
1200 We need to separate the two.
1202 if (!is_depvar (i, cmd.v_dependent[k]))
1204 if (var_is_alpha (v_variables[i]))
1206 design_matrix_set_categorical (X, row,
1207 v_variables[i], val);
1211 design_matrix_set_numeric (X, row, v_variables[i],
1216 val = case_data (&c, cmd.v_dependent[k]);
1217 gsl_vector_set (Y, row, val->f);
1222 Now that we know the number of coefficients, allocate space
1223 and store pointers to the variables that correspond to the
1226 coeff_init (models[k], X);
1229 Find the least-squares estimates and other statistics.
1231 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1232 subcommand_statistics (cmd.a_statistics, models[k]);
1233 subcommand_export (cmd.sbc_export, models[k]);
1235 gsl_vector_free (Y);
1236 design_matrix_destroy (X);
1238 free (lopts.get_indep_mean_std);
1239 casereader_destroy (r);
1242 free (is_missing_case);