1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <gsl/gsl_cdf.h>
22 #include <gsl/gsl_matrix.h>
23 #include <gsl/gsl_vector.h>
27 #include "regression-export.h"
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/cat-routines.h>
31 #include <data/category.h>
32 #include <data/dictionary.h>
33 #include <data/missing-values.h>
34 #include <data/procedure.h>
35 #include <data/transformations.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <language/command.h>
39 #include <language/dictionary/split-file.h>
40 #include <language/data-io/file-handle.h>
41 #include <language/lexer/lexer.h>
42 #include <libpspp/alloc.h>
43 #include <libpspp/compiler.h>
44 #include <libpspp/message.h>
45 #include <math/design-matrix.h>
46 #include <math/coefficient.h>
47 #include <math/linreg/linreg.h>
48 #include <math/moments.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
87 Moments for each of the variables used.
92 const struct variable *v;
95 /* Linear regression models. */
96 static pspp_linreg_cache **models = NULL;
99 Transformations for saving predicted values
104 int n_trns; /* Number of transformations. */
105 int trns_id; /* Which trns is this one? */
106 pspp_linreg_cache *c; /* Linear model for this trns. */
109 Variables used (both explanatory and response).
111 static const struct variable **v_variables;
116 static size_t n_variables;
119 File where the model will be saved if the EXPORT subcommand
122 static struct file_handle *model_file;
125 Return value for the procedure.
127 static int pspp_reg_rc = CMD_SUCCESS;
129 static bool run_regression (const struct ccase *,
130 const struct casefile *, void *,
131 const struct dataset *);
134 STATISTICS subcommand output functions.
136 static void reg_stats_r (pspp_linreg_cache *);
137 static void reg_stats_coeff (pspp_linreg_cache *);
138 static void reg_stats_anova (pspp_linreg_cache *);
139 static void reg_stats_outs (pspp_linreg_cache *);
140 static void reg_stats_zpp (pspp_linreg_cache *);
141 static void reg_stats_label (pspp_linreg_cache *);
142 static void reg_stats_sha (pspp_linreg_cache *);
143 static void reg_stats_ci (pspp_linreg_cache *);
144 static void reg_stats_f (pspp_linreg_cache *);
145 static void reg_stats_bcov (pspp_linreg_cache *);
146 static void reg_stats_ses (pspp_linreg_cache *);
147 static void reg_stats_xtx (pspp_linreg_cache *);
148 static void reg_stats_collin (pspp_linreg_cache *);
149 static void reg_stats_tol (pspp_linreg_cache *);
150 static void reg_stats_selection (pspp_linreg_cache *);
151 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
152 int, pspp_linreg_cache *);
155 reg_stats_r (pspp_linreg_cache * c)
165 rsq = c->ssm / c->sst;
166 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
167 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
168 t = tab_create (n_cols, n_rows, 0);
169 tab_dim (t, tab_natural_dimensions);
170 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
171 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
172 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
173 tab_vline (t, TAL_0, 1, 0, 0);
175 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
176 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
177 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
178 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
179 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
180 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
181 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
182 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
183 tab_title (t, _("Model Summary"));
188 Table showing estimated regression coefficients.
191 reg_stats_coeff (pspp_linreg_cache * c)
203 const struct variable *v;
204 const union value *val;
209 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
210 n_rows = c->n_coeffs + 2;
212 t = tab_create (n_cols, n_rows, 0);
213 tab_headers (t, 2, 0, 1, 0);
214 tab_dim (t, tab_natural_dimensions);
215 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
216 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
217 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
218 tab_vline (t, TAL_0, 1, 0, 0);
220 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
221 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
222 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
223 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
224 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
225 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
226 coeff = c->coeff[0]->estimate;
227 tab_float (t, 2, 1, 0, coeff, 10, 2);
228 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
229 tab_float (t, 3, 1, 0, std_err, 10, 2);
230 beta = coeff / c->depvar_std;
231 tab_float (t, 4, 1, 0, beta, 10, 2);
232 t_stat = coeff / std_err;
233 tab_float (t, 5, 1, 0, t_stat, 10, 2);
234 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
235 tab_float (t, 6, 1, 0, pval, 10, 2);
236 for (j = 1; j <= c->n_indeps; j++)
238 v = pspp_coeff_get_var (c->coeff[j], 0);
239 label = var_to_string (v);
240 /* Do not overwrite the variable's name. */
241 strncpy (tmp, label, MAX_STRING);
242 if (var_is_alpha (v))
245 Append the value associated with this coefficient.
246 This makes sense only if we us the usual binary encoding
250 val = pspp_coeff_get_value (c->coeff[j], v);
251 val_s = var_get_value_name (v, val);
252 strncat (tmp, val_s, MAX_STRING);
255 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
257 Regression coefficients.
259 coeff = c->coeff[j]->estimate;
260 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
262 Standard error of the coefficients.
264 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
265 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
267 'Standardized' coefficient, i.e., regression coefficient
268 if all variables had unit variance.
270 beta = gsl_vector_get (c->indep_std, j);
271 beta *= coeff / c->depvar_std;
272 tab_float (t, 4, j + 1, 0, beta, 10, 2);
275 Test statistic for H0: coefficient is 0.
277 t_stat = coeff / std_err;
278 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
280 P values for the test statistic above.
283 2 * gsl_cdf_tdist_Q (fabs (t_stat),
284 (double) (c->n_obs - c->n_coeffs));
285 tab_float (t, 6, j + 1, 0, pval, 10, 2);
287 tab_title (t, _("Coefficients"));
293 Display the ANOVA table.
296 reg_stats_anova (pspp_linreg_cache * c)
300 const double msm = c->ssm / c->dfm;
301 const double mse = c->sse / c->dfe;
302 const double F = msm / mse;
303 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
308 t = tab_create (n_cols, n_rows, 0);
309 tab_headers (t, 2, 0, 1, 0);
310 tab_dim (t, tab_natural_dimensions);
312 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
314 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
315 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
316 tab_vline (t, TAL_0, 1, 0, 0);
318 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
319 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
320 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
321 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
322 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
324 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
325 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
326 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
328 /* Sums of Squares */
329 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
330 tab_float (t, 2, 3, 0, c->sst, 10, 2);
331 tab_float (t, 2, 2, 0, c->sse, 10, 2);
334 /* Degrees of freedom */
335 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
336 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
337 tab_float (t, 3, 3, 0, c->dft, 4, 0);
341 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
342 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
344 tab_float (t, 5, 1, 0, F, 8, 3);
346 tab_float (t, 6, 1, 0, pval, 8, 3);
348 tab_title (t, _("ANOVA"));
352 reg_stats_outs (pspp_linreg_cache * c)
357 reg_stats_zpp (pspp_linreg_cache * c)
362 reg_stats_label (pspp_linreg_cache * c)
367 reg_stats_sha (pspp_linreg_cache * c)
372 reg_stats_ci (pspp_linreg_cache * c)
377 reg_stats_f (pspp_linreg_cache * c)
382 reg_stats_bcov (pspp_linreg_cache * c)
394 n_cols = c->n_indeps + 1 + 2;
395 n_rows = 2 * (c->n_indeps + 1);
396 t = tab_create (n_cols, n_rows, 0);
397 tab_headers (t, 2, 0, 1, 0);
398 tab_dim (t, tab_natural_dimensions);
399 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
400 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
401 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
402 tab_vline (t, TAL_0, 1, 0, 0);
403 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
404 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
405 for (i = 1; i < c->n_coeffs; i++)
407 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
408 label = var_to_string (v);
409 tab_text (t, 2, i, TAB_CENTER, label);
410 tab_text (t, i + 2, 0, TAB_CENTER, label);
411 for (k = 1; k < c->n_coeffs; k++)
413 col = (i <= k) ? k : i;
414 row = (i <= k) ? i : k;
415 tab_float (t, k + 2, i, TAB_CENTER,
416 gsl_matrix_get (c->cov, row, col), 8, 3);
419 tab_title (t, _("Coefficient Correlations"));
423 reg_stats_ses (pspp_linreg_cache * c)
428 reg_stats_xtx (pspp_linreg_cache * c)
433 reg_stats_collin (pspp_linreg_cache * c)
438 reg_stats_tol (pspp_linreg_cache * c)
443 reg_stats_selection (pspp_linreg_cache * c)
449 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
450 int keyword, pspp_linreg_cache * c)
459 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
462 The order here must match the order in which the STATISTICS
463 keywords appear in the specification section above.
490 Set everything but F.
492 for (i = 0; i < f; i++)
499 for (i = 0; i < all; i++)
507 Default output: ANOVA table, parameter estimates,
508 and statistics for variables not entered into model,
511 if (keywords[defaults] | d)
519 statistics_keyword_output (reg_stats_r, keywords[r], c);
520 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
521 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
522 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
523 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
524 statistics_keyword_output (reg_stats_label, keywords[label], c);
525 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
526 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
527 statistics_keyword_output (reg_stats_f, keywords[f], c);
528 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
529 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
530 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
531 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
532 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
533 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
537 Free the transformation. Free its linear model if this
538 transformation is the last one.
541 regression_trns_free (void *t_)
544 struct reg_trns *t = t_;
546 if (t->trns_id == t->n_trns)
548 result = pspp_linreg_cache_free (t->c);
556 Gets the predicted values.
559 regression_trns_pred_proc (void *t_, struct ccase *c,
560 casenumber case_idx UNUSED)
564 struct reg_trns *trns = t_;
565 pspp_linreg_cache *model;
566 union value *output = NULL;
567 const union value **vals = NULL;
568 struct variable **vars = NULL;
570 assert (trns != NULL);
572 assert (model != NULL);
573 assert (model->depvar != NULL);
574 assert (model->pred != NULL);
576 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
577 n_vals = (*model->get_vars) (model, vars);
579 vals = xnmalloc (n_vals, sizeof (*vals));
580 output = case_data_rw (c, model->pred);
581 assert (output != NULL);
583 for (i = 0; i < n_vals; i++)
585 vals[i] = case_data (c, vars[i]);
587 output->f = (*model->predict) ((const struct variable **) vars,
588 vals, model, n_vals);
591 return TRNS_CONTINUE;
598 regression_trns_resid_proc (void *t_, struct ccase *c,
599 casenumber case_idx UNUSED)
603 struct reg_trns *trns = t_;
604 pspp_linreg_cache *model;
605 union value *output = NULL;
606 const union value **vals = NULL;
607 const union value *obs = NULL;
608 struct variable **vars = NULL;
610 assert (trns != NULL);
612 assert (model != NULL);
613 assert (model->depvar != NULL);
614 assert (model->resid != NULL);
616 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
617 n_vals = (*model->get_vars) (model, vars);
619 vals = xnmalloc (n_vals, sizeof (*vals));
620 output = case_data_rw (c, model->resid);
621 assert (output != NULL);
623 for (i = 0; i < n_vals; i++)
625 vals[i] = case_data (c, vars[i]);
627 obs = case_data (c, model->depvar);
628 output->f = (*model->residual) ((const struct variable **) vars,
629 vals, obs, model, n_vals);
632 return TRNS_CONTINUE;
636 Returns false if NAME is a duplicate of any existing variable name.
639 try_name (const struct dictionary *dict, const char *name)
641 if (dict_lookup_var (dict, name) != NULL)
648 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN],
649 const char prefix[LONG_NAME_LEN])
653 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
654 while (!try_name (dict, name))
657 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
662 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
663 pspp_linreg_cache * c, struct variable **v, int n_trns)
665 struct dictionary *dict = dataset_dict (ds);
666 static int trns_index = 1;
667 char name[LONG_NAME_LEN];
668 struct variable *new_var;
669 struct reg_trns *t = NULL;
671 t = xmalloc (sizeof (*t));
672 t->trns_id = trns_index;
675 reg_get_name (dict, name, prefix);
676 new_var = dict_create_var (dict, name, 0);
677 assert (new_var != NULL);
679 add_transformation (ds, f, regression_trns_free, t);
684 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
686 pspp_linreg_cache **lc;
690 assert (models != NULL);
694 /* Count the number of transformations we will need. */
695 for (i = 0; i < REGRESSION_SV_count; i++)
702 n_trns *= cmd.n_dependent;
704 for (lc = models; lc < models + cmd.n_dependent; lc++)
706 assert (*lc != NULL);
707 assert ((*lc)->depvar != NULL);
708 if (cmd.a_save[REGRESSION_SV_RESID])
710 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
711 &(*lc)->resid, n_trns);
713 if (cmd.a_save[REGRESSION_SV_PRED])
715 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
716 &(*lc)->pred, n_trns);
722 for (lc = models; lc < models + cmd.n_dependent; lc++)
726 pspp_linreg_cache_free (*lc);
733 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
737 for (i = 0; i < n_vars; i++)
748 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
752 struct variable **varlist;
754 fprintf (fp, "%s", reg_export_categorical_encode_1);
756 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
757 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
759 struct pspp_coeff *coeff = c->coeff[i];
760 const struct variable *v = pspp_coeff_get_var (coeff, 0);
761 if (var_is_alpha (v))
763 if (!reg_inserted (v, varlist, n_vars))
765 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
767 varlist[n_vars] = (struct variable *) v;
772 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
773 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
775 for (i = 0; i < n_vars - 1; i++)
777 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
779 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
781 for (i = 0; i < n_vars; i++)
783 int n_categories = cat_get_n_categories (varlist[i]);
786 fprintf (fp, "%s.name = \"%s\";\n\t",
787 var_get_name (varlist[i]), var_get_name (varlist[i]));
788 fprintf (fp, "%s.n_vals = %d;\n\t",
789 var_get_name (varlist[i]), n_categories);
791 for (j = 0; j < n_categories; j++)
793 union value *val = cat_subscript_to_value (j, varlist[i]);
794 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
795 var_get_name (varlist[i]), j,
796 var_get_value_name (varlist[i], val));
799 fprintf (fp, "%s", reg_export_categorical_encode_2);
803 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
806 struct pspp_coeff *coeff;
807 const struct variable *v;
809 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
810 for (i = 1; i < c->n_indeps; i++)
813 v = pspp_coeff_get_var (coeff, 0);
814 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
817 v = pspp_coeff_get_var (coeff, 0);
818 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
821 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
823 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
824 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
825 reg_print_depvars (fp, c);
826 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
828 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
829 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
832 reg_has_categorical (pspp_linreg_cache * c)
835 const struct variable *v;
837 for (i = 1; i < c->n_coeffs; i++)
839 v = pspp_coeff_get_var (c->coeff[i], 0);
840 if (var_is_alpha (v))
847 subcommand_export (int export, pspp_linreg_cache * c)
852 int n_quantiles = 100;
854 struct pspp_coeff *coeff;
859 assert (model_file != NULL);
860 fp = fopen (fh_get_file_name (model_file), "w");
862 fprintf (fp, "%s", reg_preamble);
863 reg_print_getvar (fp, c);
864 if (reg_has_categorical (c))
866 reg_print_categorical_encoding (fp, c);
868 fprintf (fp, "%s", reg_export_t_quantiles_1);
869 for (i = 0; i < n_quantiles - 1; i++)
871 tmp = 0.5 + 0.005 * (double) i;
872 fprintf (fp, "%.15e,\n\t\t",
873 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
875 fprintf (fp, "%.15e};\n\t",
876 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
877 fprintf (fp, "%s", reg_export_t_quantiles_2);
878 fprintf (fp, "%s", reg_mean_cmt);
879 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
880 fprintf (fp, "const char *var_names[])\n{\n\t");
881 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
882 for (i = 1; i < c->n_indeps; i++)
885 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
888 fprintf (fp, "%.15e};\n\t", coeff->estimate);
890 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
891 fprintf (fp, "int i;\n\tint j;\n\n\t");
892 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
893 fprintf (fp, "%s", reg_getvar);
894 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
896 for (i = 0; i < c->cov->size1 - 1; i++)
899 for (j = 0; j < c->cov->size2 - 1; j++)
901 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
903 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
906 for (j = 0; j < c->cov->size2 - 1; j++)
908 fprintf (fp, "%.15e, ",
909 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
911 fprintf (fp, "%.15e}\n\t",
912 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
913 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
915 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
916 fprintf (fp, "%s", reg_variance);
917 fprintf (fp, "%s", reg_export_confidence_interval);
918 tmp = c->mse * c->mse;
919 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
920 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
921 fprintf (fp, "%s", reg_export_prediction_interval_3);
923 fp = fopen ("pspp_model_reg.h", "w");
924 fprintf (fp, "%s", reg_header);
930 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
931 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
933 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
934 if (!lex_force_match (lexer, '('))
937 if (lex_match (lexer, '*'))
941 model_file = fh_parse (lexer, FH_REF_FILE);
942 if (model_file == NULL)
946 if (!lex_force_match (lexer, ')'))
953 cmd_regression (struct lexer *lexer, struct dataset *ds)
957 if (!parse_regression (lexer, ds, &cmd, NULL))
960 models = xnmalloc (cmd.n_dependent, sizeof *models);
961 for (i = 0; i < cmd.n_dependent; i++)
965 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
966 return CMD_CASCADING_FAILURE;
967 subcommand_save (ds, cmd.sbc_save, models);
974 Is variable k the dependent variable?
977 is_depvar (size_t k, const struct variable *v)
979 return v == v_variables[k];
983 Mark missing cases. Return the number of non-missing cases.
984 Compute the first two moments.
987 mark_missing_cases (const struct casefile *cf, const struct variable *v,
988 int *is_missing_case, double n_data,
989 struct moments_var *mom)
991 struct casereader *r;
994 const union value *val;
997 for (r = casefile_get_reader (cf, NULL);
998 casereader_read (r, &c); case_destroy (&c))
1000 row = casereader_cnum (r) - 1;
1002 val = case_data (&c, v);
1005 moments1_add (mom->m, val->f, w);
1007 cat_value_update (v, val);
1008 if (var_is_value_missing (v, val, MV_ANY))
1010 if (!is_missing_case[row])
1012 /* Now it is missing. */
1014 is_missing_case[row] = 1;
1018 casereader_destroy (r);
1023 /* Parser for the variables sub command */
1025 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
1026 struct cmd_regression *cmd UNUSED,
1029 const struct dictionary *dict = dataset_dict (ds);
1031 lex_match (lexer, '=');
1033 if ((lex_token (lexer) != T_ID
1034 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1035 && lex_token (lexer) != T_ALL)
1039 if (!parse_variables_const
1040 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1045 assert (n_variables);
1051 Count the explanatory variables. The user may or may
1052 not have specified a response variable in the syntax.
1055 get_n_indep (const struct variable *v)
1060 result = n_variables;
1061 while (i < n_variables)
1063 if (is_depvar (i, v))
1074 Read from the active file. Identify the explanatory variables in
1075 v_variables. Encode categorical variables. Drop cases with missing
1079 prepare_data (int n_data, int is_missing_case[],
1080 const struct variable **indep_vars,
1081 const struct variable *depvar, const struct casefile *cf,
1082 struct moments_var *mom)
1087 assert (indep_vars != NULL);
1089 for (i = 0; i < n_variables; i++)
1091 if (!is_depvar (i, depvar))
1093 indep_vars[j] = v_variables[i];
1095 if (var_is_alpha (v_variables[i]))
1097 /* Make a place to hold the binary vectors
1098 corresponding to this variable's values. */
1099 cat_stored_values_create (v_variables[i]);
1102 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data,
1107 Mark missing cases for the dependent variable.
1109 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data, NULL);
1114 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1116 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1117 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1118 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1119 pspp_coeff_init (c->coeff + 1, dm);
1123 Put the moments in the linreg cache.
1126 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1127 struct design_matrix *dm, size_t n)
1137 Scan the variable names in the columns of the design matrix.
1138 When we find the variable we need, insert its mean in the cache.
1140 for (i = 0; i < dm->m->size2; i++)
1142 for (j = 0; j < n; j++)
1144 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1146 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1147 &skewness, &kurtosis);
1148 gsl_vector_set (c->indep_means, i, mean);
1149 gsl_vector_set (c->indep_std, i, sqrt (variance));
1155 run_regression (const struct ccase *first,
1156 const struct casefile *cf, void *cmd_ UNUSED,
1157 const struct dataset *ds)
1160 size_t n_data = 0; /* Number of valide cases. */
1161 size_t n_cases; /* Number of cases. */
1167 Keep track of the missing cases.
1169 int *is_missing_case;
1170 const union value *val;
1171 struct casereader *r;
1173 const struct variable **indep_vars;
1174 struct design_matrix *X;
1175 struct moments_var *mom;
1178 pspp_linreg_opts lopts;
1180 assert (models != NULL);
1182 output_split_file_values (ds, first);
1186 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1190 n_cases = casefile_get_case_cnt (cf);
1192 for (i = 0; i < cmd.n_dependent; i++)
1194 if (!var_is_numeric (cmd.v_dependent[i]))
1196 msg (SE, gettext ("Dependent variable must be numeric."));
1197 pspp_reg_rc = CMD_FAILURE;
1202 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1203 mom = xnmalloc (n_variables, sizeof (*mom));
1204 for (i = 0; i < n_variables; i++)
1206 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1207 (mom + i)->v = v_variables[i];
1209 lopts.get_depvar_mean_std = 1;
1211 for (k = 0; k < cmd.n_dependent; k++)
1213 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1214 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1215 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1216 assert (indep_vars != NULL);
1218 for (i = 0; i < n_cases; i++)
1220 is_missing_case[i] = 0;
1222 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1224 (const struct casefile *) cf, mom);
1227 Y = gsl_vector_alloc (n_data);
1230 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1232 for (i = 0; i < X->m->size2; i++)
1234 lopts.get_indep_mean_std[i] = 1;
1236 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1237 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1238 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1239 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1241 For large data sets, use QR decomposition.
1243 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1245 models[k]->method = PSPP_LINREG_SVD;
1249 The second pass fills the design matrix.
1252 for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c);
1254 /* Iterate over the cases. */
1256 case_num = casereader_cnum (r) - 1;
1257 if (!is_missing_case[case_num])
1259 for (i = 0; i < n_variables; ++i) /* Iterate over the
1264 val = case_data (&c, v_variables[i]);
1266 Independent/dependent variable separation. The
1267 'variables' subcommand specifies a varlist which contains
1268 both dependent and independent variables. The dependent
1269 variables are specified with the 'dependent'
1270 subcommand, and maybe also in the 'variables' subcommand.
1271 We need to separate the two.
1273 if (!is_depvar (i, cmd.v_dependent[k]))
1275 if (var_is_alpha (v_variables[i]))
1277 design_matrix_set_categorical (X, row,
1278 v_variables[i], val);
1282 design_matrix_set_numeric (X, row, v_variables[i],
1287 val = case_data (&c, cmd.v_dependent[k]);
1288 gsl_vector_set (Y, row, val->f);
1293 Now that we know the number of coefficients, allocate space
1294 and store pointers to the variables that correspond to the
1297 coeff_init (models[k], X);
1300 Find the least-squares estimates and other statistics.
1302 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1303 compute_moments (models[k], mom, X, n_variables);
1304 subcommand_statistics (cmd.a_statistics, models[k]);
1305 subcommand_export (cmd.sbc_export, models[k]);
1307 gsl_vector_free (Y);
1308 design_matrix_destroy (X);
1310 free (lopts.get_indep_mean_std);
1311 casereader_destroy (r);
1314 for (i = 0; i < n_variables; i++)
1316 moments1_destroy ((mom + i)->m);
1319 free (is_missing_case);