1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <gsl/gsl_cdf.h>
22 #include <gsl/gsl_matrix.h>
23 #include <gsl/gsl_vector.h>
27 #include "regression-export.h"
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/cat-routines.h>
31 #include <data/category.h>
32 #include <data/dictionary.h>
33 #include <data/missing-values.h>
34 #include <data/procedure.h>
35 #include <data/transformations.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <language/command.h>
39 #include <language/dictionary/split-file.h>
40 #include <language/data-io/file-handle.h>
41 #include <language/lexer/lexer.h>
42 #include <libpspp/alloc.h>
43 #include <libpspp/compiler.h>
44 #include <libpspp/message.h>
45 #include <math/design-matrix.h>
46 #include <math/coefficient.h>
47 #include <math/linreg/linreg.h>
48 #include <math/moments.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
79 +save[sv_]=resid,pred;
84 static struct cmd_regression cmd;
87 Moments for each of the variables used.
95 /* Linear regression models. */
96 static pspp_linreg_cache **models = NULL;
99 Transformations for saving predicted values
104 int n_trns; /* Number of transformations. */
105 int trns_id; /* Which trns is this one? */
106 pspp_linreg_cache *c; /* Linear model for this trns. */
109 Variables used (both explanatory and response).
111 static struct variable **v_variables;
116 static size_t n_variables;
119 File where the model will be saved if the EXPORT subcommand
122 static struct file_handle *model_file;
125 Return value for the procedure.
127 static int pspp_reg_rc = CMD_SUCCESS;
129 static bool run_regression (const struct ccase *,
130 const struct casefile *, void *,
131 const struct dataset *);
134 STATISTICS subcommand output functions.
136 static void reg_stats_r (pspp_linreg_cache *);
137 static void reg_stats_coeff (pspp_linreg_cache *);
138 static void reg_stats_anova (pspp_linreg_cache *);
139 static void reg_stats_outs (pspp_linreg_cache *);
140 static void reg_stats_zpp (pspp_linreg_cache *);
141 static void reg_stats_label (pspp_linreg_cache *);
142 static void reg_stats_sha (pspp_linreg_cache *);
143 static void reg_stats_ci (pspp_linreg_cache *);
144 static void reg_stats_f (pspp_linreg_cache *);
145 static void reg_stats_bcov (pspp_linreg_cache *);
146 static void reg_stats_ses (pspp_linreg_cache *);
147 static void reg_stats_xtx (pspp_linreg_cache *);
148 static void reg_stats_collin (pspp_linreg_cache *);
149 static void reg_stats_tol (pspp_linreg_cache *);
150 static void reg_stats_selection (pspp_linreg_cache *);
151 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
152 int, pspp_linreg_cache *);
155 reg_stats_r (pspp_linreg_cache * c)
165 rsq = c->ssm / c->sst;
166 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
167 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
168 t = tab_create (n_cols, n_rows, 0);
169 tab_dim (t, tab_natural_dimensions);
170 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
171 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
172 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
173 tab_vline (t, TAL_0, 1, 0, 0);
175 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
176 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
177 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
178 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
179 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
180 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
181 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
182 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
183 tab_title (t, _("Model Summary"));
188 Table showing estimated regression coefficients.
191 reg_stats_coeff (pspp_linreg_cache * c)
203 const struct variable *v;
204 const union value *val;
209 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
210 n_rows = c->n_coeffs + 2;
212 t = tab_create (n_cols, n_rows, 0);
213 tab_headers (t, 2, 0, 1, 0);
214 tab_dim (t, tab_natural_dimensions);
215 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
216 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
217 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
218 tab_vline (t, TAL_0, 1, 0, 0);
220 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
221 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
222 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
223 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
224 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
225 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
226 coeff = c->coeff[0]->estimate;
227 tab_float (t, 2, 1, 0, coeff, 10, 2);
228 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
229 tab_float (t, 3, 1, 0, std_err, 10, 2);
230 beta = coeff / c->depvar_std;
231 tab_float (t, 4, 1, 0, beta, 10, 2);
232 t_stat = coeff / std_err;
233 tab_float (t, 5, 1, 0, t_stat, 10, 2);
234 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
235 tab_float (t, 6, 1, 0, pval, 10, 2);
236 for (j = 1; j <= c->n_indeps; j++)
238 v = pspp_coeff_get_var (c->coeff[j], 0);
239 label = var_to_string (v);
240 /* Do not overwrite the variable's name. */
241 strncpy (tmp, label, MAX_STRING);
242 if (var_is_alpha (v))
245 Append the value associated with this coefficient.
246 This makes sense only if we us the usual binary encoding
250 val = pspp_coeff_get_value (c->coeff[j], v);
251 val_s = var_get_value_name (v, val);
252 strncat (tmp, val_s, MAX_STRING);
255 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
257 Regression coefficients.
259 coeff = c->coeff[j]->estimate;
260 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
262 Standard error of the coefficients.
264 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
265 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
267 'Standardized' coefficient, i.e., regression coefficient
268 if all variables had unit variance.
270 beta = gsl_vector_get (c->indep_std, j);
271 beta *= coeff / c->depvar_std;
272 tab_float (t, 4, j + 1, 0, beta, 10, 2);
275 Test statistic for H0: coefficient is 0.
277 t_stat = coeff / std_err;
278 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
280 P values for the test statistic above.
282 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (c->n_obs - c->n_coeffs));
283 tab_float (t, 6, j + 1, 0, pval, 10, 2);
285 tab_title (t, _("Coefficients"));
291 Display the ANOVA table.
294 reg_stats_anova (pspp_linreg_cache * c)
298 const double msm = c->ssm / c->dfm;
299 const double mse = c->sse / c->dfe;
300 const double F = msm / mse;
301 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
306 t = tab_create (n_cols, n_rows, 0);
307 tab_headers (t, 2, 0, 1, 0);
308 tab_dim (t, tab_natural_dimensions);
310 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
312 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
313 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
314 tab_vline (t, TAL_0, 1, 0, 0);
316 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
317 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
318 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
319 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
320 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
322 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
323 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
324 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
326 /* Sums of Squares */
327 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
328 tab_float (t, 2, 3, 0, c->sst, 10, 2);
329 tab_float (t, 2, 2, 0, c->sse, 10, 2);
332 /* Degrees of freedom */
333 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
334 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
335 tab_float (t, 3, 3, 0, c->dft, 4, 0);
339 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
340 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
342 tab_float (t, 5, 1, 0, F, 8, 3);
344 tab_float (t, 6, 1, 0, pval, 8, 3);
346 tab_title (t, _("ANOVA"));
350 reg_stats_outs (pspp_linreg_cache * c)
355 reg_stats_zpp (pspp_linreg_cache * c)
360 reg_stats_label (pspp_linreg_cache * c)
365 reg_stats_sha (pspp_linreg_cache * c)
370 reg_stats_ci (pspp_linreg_cache * c)
375 reg_stats_f (pspp_linreg_cache * c)
380 reg_stats_bcov (pspp_linreg_cache * c)
392 n_cols = c->n_indeps + 1 + 2;
393 n_rows = 2 * (c->n_indeps + 1);
394 t = tab_create (n_cols, n_rows, 0);
395 tab_headers (t, 2, 0, 1, 0);
396 tab_dim (t, tab_natural_dimensions);
397 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
398 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
399 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
400 tab_vline (t, TAL_0, 1, 0, 0);
401 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
402 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
403 for (i = 1; i < c->n_coeffs; i++)
405 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
406 label = var_to_string (v);
407 tab_text (t, 2, i, TAB_CENTER, label);
408 tab_text (t, i + 2, 0, TAB_CENTER, label);
409 for (k = 1; k < c->n_coeffs; k++)
411 col = (i <= k) ? k : i;
412 row = (i <= k) ? i : k;
413 tab_float (t, k + 2, i, TAB_CENTER,
414 gsl_matrix_get (c->cov, row, col), 8, 3);
417 tab_title (t, _("Coefficient Correlations"));
421 reg_stats_ses (pspp_linreg_cache * c)
426 reg_stats_xtx (pspp_linreg_cache * c)
431 reg_stats_collin (pspp_linreg_cache * c)
436 reg_stats_tol (pspp_linreg_cache * c)
441 reg_stats_selection (pspp_linreg_cache * c)
447 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
448 int keyword, pspp_linreg_cache * c)
457 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
460 The order here must match the order in which the STATISTICS
461 keywords appear in the specification section above.
488 Set everything but F.
490 for (i = 0; i < f; i++)
497 for (i = 0; i < all; i++)
505 Default output: ANOVA table, parameter estimates,
506 and statistics for variables not entered into model,
509 if (keywords[defaults] | d)
517 statistics_keyword_output (reg_stats_r, keywords[r], c);
518 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
519 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
520 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
521 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
522 statistics_keyword_output (reg_stats_label, keywords[label], c);
523 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
524 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
525 statistics_keyword_output (reg_stats_f, keywords[f], c);
526 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
527 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
528 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
529 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
530 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
531 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
535 Free the transformation. Free its linear model if this
536 transformation is the last one.
539 regression_trns_free (void *t_)
542 struct reg_trns *t = t_;
544 if (t->trns_id == t->n_trns)
546 result = pspp_linreg_cache_free (t->c);
554 Gets the predicted values.
557 regression_trns_pred_proc (void *t_, struct ccase *c,
558 casenumber case_idx UNUSED)
562 struct reg_trns *trns = t_;
563 pspp_linreg_cache *model;
564 union value *output = NULL;
565 const union value **vals = NULL;
566 struct variable **vars = NULL;
568 assert (trns != NULL);
570 assert (model != NULL);
571 assert (model->depvar != NULL);
572 assert (model->pred != NULL);
574 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
575 n_vals = (*model->get_vars) (model, vars);
577 vals = xnmalloc (n_vals, sizeof (*vals));
578 output = case_data_rw (c, model->pred);
579 assert (output != NULL);
581 for (i = 0; i < n_vals; i++)
583 vals[i] = case_data (c, vars[i]);
585 output->f = (*model->predict) ((const struct variable **) vars,
586 vals, model, n_vals);
589 return TRNS_CONTINUE;
596 regression_trns_resid_proc (void *t_, struct ccase *c,
597 casenumber case_idx UNUSED)
601 struct reg_trns *trns = t_;
602 pspp_linreg_cache *model;
603 union value *output = NULL;
604 const union value **vals = NULL;
605 const union value *obs = NULL;
606 struct variable **vars = NULL;
608 assert (trns != NULL);
610 assert (model != NULL);
611 assert (model->depvar != NULL);
612 assert (model->resid != NULL);
614 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
615 n_vals = (*model->get_vars) (model, vars);
617 vals = xnmalloc (n_vals, sizeof (*vals));
618 output = case_data_rw (c, model->resid);
619 assert (output != NULL);
621 for (i = 0; i < n_vals; i++)
623 vals[i] = case_data (c, vars[i]);
625 obs = case_data (c, model->depvar);
626 output->f = (*model->residual) ((const struct variable **) vars,
627 vals, obs, model, n_vals);
630 return TRNS_CONTINUE;
634 Returns false if NAME is a duplicate of any existing variable name.
637 try_name (const struct dictionary *dict, const char *name)
639 if (dict_lookup_var (dict, name) != NULL)
646 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
650 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
651 while (!try_name (dict, name))
654 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
659 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
660 pspp_linreg_cache * c, struct variable **v, int n_trns)
662 struct dictionary *dict = dataset_dict (ds);
663 static int trns_index = 1;
664 char name[LONG_NAME_LEN];
665 struct variable *new_var;
666 struct reg_trns *t = NULL;
668 t = xmalloc (sizeof (*t));
669 t->trns_id = trns_index;
672 reg_get_name (dict, name, prefix);
673 new_var = dict_create_var (dict, name, 0);
674 assert (new_var != NULL);
676 add_transformation (ds, f, regression_trns_free, t);
681 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
683 pspp_linreg_cache **lc;
687 assert (models != NULL);
691 /* Count the number of transformations we will need. */
692 for (i = 0; i < REGRESSION_SV_count; i++)
699 n_trns *= cmd.n_dependent;
701 for (lc = models; lc < models + cmd.n_dependent; lc++)
703 assert (*lc != NULL);
704 assert ((*lc)->depvar != NULL);
705 if (cmd.a_save[REGRESSION_SV_RESID])
707 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
708 &(*lc)->resid, n_trns);
710 if (cmd.a_save[REGRESSION_SV_PRED])
712 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
713 &(*lc)->pred, n_trns);
719 for (lc = models; lc < models + cmd.n_dependent; lc++)
721 assert (*lc != NULL);
722 pspp_linreg_cache_free (*lc);
728 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
732 for (i = 0; i < n_vars; i++)
743 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
747 struct variable **varlist;
749 fprintf (fp, "%s", reg_export_categorical_encode_1);
751 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
752 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
754 struct pspp_coeff *coeff = c->coeff[i];
755 const struct variable *v = pspp_coeff_get_var (coeff, 0);
756 if (var_is_alpha (v))
758 if (!reg_inserted (v, varlist, n_vars))
760 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
762 varlist[n_vars] = (struct variable *) v;
767 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
768 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
770 for (i = 0; i < n_vars - 1; i++)
772 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
774 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
776 for (i = 0; i < n_vars; i++)
778 int n_categories = cat_get_n_categories (varlist[i]);
781 fprintf (fp, "%s.name = \"%s\";\n\t",
782 var_get_name (varlist[i]),
783 var_get_name (varlist[i]));
784 fprintf (fp, "%s.n_vals = %d;\n\t",
785 var_get_name (varlist[i]),
788 for (j = 0; j < n_categories; j++)
790 union value *val = cat_subscript_to_value (j, varlist[i]);
791 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
792 var_get_name (varlist[i]), j,
793 var_get_value_name (varlist[i], val));
796 fprintf (fp, "%s", reg_export_categorical_encode_2);
800 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
803 struct pspp_coeff *coeff;
804 const struct variable *v;
806 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
807 for (i = 1; i < c->n_indeps; i++)
810 v = pspp_coeff_get_var (coeff, 0);
811 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
814 v = pspp_coeff_get_var (coeff, 0);
815 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
818 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
820 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
821 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
822 reg_print_depvars (fp, c);
823 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
825 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
826 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
829 reg_has_categorical (pspp_linreg_cache * c)
832 const struct variable *v;
834 for (i = 1; i < c->n_coeffs; i++)
836 v = pspp_coeff_get_var (c->coeff[i], 0);
837 if (var_is_alpha (v))
844 subcommand_export (int export, pspp_linreg_cache * c)
849 int n_quantiles = 100;
851 struct pspp_coeff *coeff;
856 assert (model_file != NULL);
857 fp = fopen (fh_get_file_name (model_file), "w");
859 fprintf (fp, "%s", reg_preamble);
860 reg_print_getvar (fp, c);
861 if (reg_has_categorical (c))
863 reg_print_categorical_encoding (fp, c);
865 fprintf (fp, "%s", reg_export_t_quantiles_1);
866 for (i = 0; i < n_quantiles - 1; i++)
868 tmp = 0.5 + 0.005 * (double) i;
869 fprintf (fp, "%.15e,\n\t\t",
870 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
872 fprintf (fp, "%.15e};\n\t",
873 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
874 fprintf (fp, "%s", reg_export_t_quantiles_2);
875 fprintf (fp, "%s", reg_mean_cmt);
876 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
877 fprintf (fp, "const char *var_names[])\n{\n\t");
878 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
879 for (i = 1; i < c->n_indeps; i++)
882 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
885 fprintf (fp, "%.15e};\n\t", coeff->estimate);
887 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
888 fprintf (fp, "int i;\n\tint j;\n\n\t");
889 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
890 fprintf (fp, "%s", reg_getvar);
891 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
893 for (i = 0; i < c->cov->size1 - 1; i++)
896 for (j = 0; j < c->cov->size2 - 1; j++)
898 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
900 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
903 for (j = 0; j < c->cov->size2 - 1; j++)
905 fprintf (fp, "%.15e, ",
906 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
908 fprintf (fp, "%.15e}\n\t",
909 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
910 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
912 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
913 fprintf (fp, "%s", reg_variance);
914 fprintf (fp, "%s", reg_export_confidence_interval);
915 tmp = c->mse * c->mse;
916 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
917 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
918 fprintf (fp, "%s", reg_export_prediction_interval_3);
920 fp = fopen ("pspp_model_reg.h", "w");
921 fprintf (fp, "%s", reg_header);
927 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED, struct cmd_regression *cmd UNUSED, void *aux UNUSED)
929 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
930 if (!lex_force_match (lexer, '('))
933 if (lex_match (lexer, '*'))
937 model_file = fh_parse (lexer, FH_REF_FILE);
938 if (model_file == NULL)
942 if (!lex_force_match (lexer, ')'))
949 cmd_regression (struct lexer *lexer, struct dataset *ds)
951 if (!parse_regression (lexer, ds, &cmd, NULL))
954 models = xnmalloc (cmd.n_dependent, sizeof *models);
955 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
956 return CMD_CASCADING_FAILURE;
957 subcommand_save (ds, cmd.sbc_save, models);
964 Is variable k the dependent variable?
967 is_depvar (size_t k, const struct variable *v)
969 return v == v_variables[k];
973 Mark missing cases. Return the number of non-missing cases.
974 Compute the first two moments.
977 mark_missing_cases (const struct casefile *cf, struct variable *v,
978 int *is_missing_case, double n_data, struct moments_var *mom)
980 struct casereader *r;
983 const union value *val;
986 for (r = casefile_get_reader (cf, NULL);
987 casereader_read (r, &c); case_destroy (&c))
989 row = casereader_cnum (r) - 1;
991 val = case_data (&c, v);
994 moments1_add (mom->m, val->f, w);
996 cat_value_update (v, val);
997 if (var_is_value_missing (v, val, MV_ANY))
999 if (!is_missing_case[row])
1001 /* Now it is missing. */
1003 is_missing_case[row] = 1;
1007 casereader_destroy (r);
1012 /* Parser for the variables sub command */
1014 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
1015 struct cmd_regression *cmd UNUSED,
1018 const struct dictionary *dict = dataset_dict (ds);
1020 lex_match (lexer, '=');
1022 if ((lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1023 && lex_token (lexer) != T_ALL)
1027 if (!parse_variables (lexer, dict, &v_variables, &n_variables, PV_NONE))
1032 assert (n_variables);
1038 Count the explanatory variables. The user may or may
1039 not have specified a response variable in the syntax.
1042 get_n_indep (const struct variable *v)
1047 result = n_variables;
1048 while (i < n_variables)
1050 if (is_depvar (i, v))
1061 Read from the active file. Identify the explanatory variables in
1062 v_variables. Encode categorical variables. Drop cases with missing
1066 prepare_data (int n_data, int is_missing_case[],
1067 struct variable **indep_vars,
1068 struct variable *depvar, const struct casefile *cf, struct moments_var *mom)
1073 assert (indep_vars != NULL);
1075 for (i = 0; i < n_variables; i++)
1077 if (!is_depvar (i, depvar))
1079 indep_vars[j] = v_variables[i];
1081 if (var_is_alpha (v_variables[i]))
1083 /* Make a place to hold the binary vectors
1084 corresponding to this variable's values. */
1085 cat_stored_values_create (v_variables[i]);
1088 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data, mom + i);
1092 Mark missing cases for the dependent variable.
1094 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data, NULL);
1099 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1101 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1102 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1103 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1104 pspp_coeff_init (c->coeff + 1, dm);
1108 Put the moments in the linreg cache.
1111 compute_moments (pspp_linreg_cache *c, struct moments_var *mom, struct design_matrix *dm, size_t n)
1121 Scan the variable names in the columns of the design matrix.
1122 When we find the variable we need, insert its mean in the cache.
1124 for (i = 0; i < dm->m->size2; i++)
1126 for (j = 0; j < n; j++)
1128 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1130 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1131 &skewness, &kurtosis);
1132 gsl_vector_set (c->indep_means, i, mean);
1133 gsl_vector_set (c->indep_std, i, sqrt (variance));
1139 run_regression (const struct ccase *first,
1140 const struct casefile *cf, void *cmd_ UNUSED, const struct dataset *ds)
1143 size_t n_data = 0; /* Number of valide cases. */
1144 size_t n_cases; /* Number of cases. */
1150 Keep track of the missing cases.
1152 int *is_missing_case;
1153 const union value *val;
1154 struct casereader *r;
1156 struct variable **indep_vars;
1157 struct design_matrix *X;
1158 struct moments_var *mom;
1161 pspp_linreg_opts lopts;
1163 assert (models != NULL);
1165 output_split_file_values (ds, first);
1169 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1173 n_cases = casefile_get_case_cnt (cf);
1175 for (i = 0; i < cmd.n_dependent; i++)
1177 if (!var_is_numeric (cmd.v_dependent[i]))
1179 msg (SE, gettext ("Dependent variable must be numeric."));
1180 pspp_reg_rc = CMD_FAILURE;
1185 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1186 mom = xnmalloc (n_variables, sizeof (*mom));
1187 for (i = 0; i < n_variables; i++)
1189 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1190 (mom + i)->v = v_variables[i];
1192 lopts.get_depvar_mean_std = 1;
1194 for (k = 0; k < cmd.n_dependent; k++)
1196 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1197 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1198 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1199 assert (indep_vars != NULL);
1201 for (i = 0; i < n_cases; i++)
1203 is_missing_case[i] = 0;
1205 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1207 (const struct casefile *) cf, mom);
1208 Y = gsl_vector_alloc (n_data);
1211 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1213 for (i = 0; i < X->m->size2; i++)
1215 lopts.get_indep_mean_std[i] = 1;
1217 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1218 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1219 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1220 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1222 For large data sets, use QR decomposition.
1224 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1226 models[k]->method = PSPP_LINREG_SVD;
1230 The second pass fills the design matrix.
1233 for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c);
1235 /* Iterate over the cases. */
1237 case_num = casereader_cnum (r) - 1;
1238 if (!is_missing_case[case_num])
1240 for (i = 0; i < n_variables; ++i) /* Iterate over the
1245 val = case_data (&c, v_variables[i]);
1247 Independent/dependent variable separation. The
1248 'variables' subcommand specifies a varlist which contains
1249 both dependent and independent variables. The dependent
1250 variables are specified with the 'dependent'
1251 subcommand, and maybe also in the 'variables' subcommand.
1252 We need to separate the two.
1254 if (!is_depvar (i, cmd.v_dependent[k]))
1256 if (var_is_alpha (v_variables[i]))
1258 design_matrix_set_categorical (X, row,
1259 v_variables[i], val);
1263 design_matrix_set_numeric (X, row, v_variables[i],
1268 val = case_data (&c, cmd.v_dependent[k]);
1269 gsl_vector_set (Y, row, val->f);
1274 Now that we know the number of coefficients, allocate space
1275 and store pointers to the variables that correspond to the
1278 coeff_init (models[k], X);
1281 Find the least-squares estimates and other statistics.
1283 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1284 compute_moments (models[k], mom, X, n_variables);
1285 subcommand_statistics (cmd.a_statistics, models[k]);
1286 subcommand_export (cmd.sbc_export, models[k]);
1288 gsl_vector_free (Y);
1289 design_matrix_destroy (X);
1291 free (lopts.get_indep_mean_std);
1292 casereader_destroy (r);
1294 for (i = 0; i < n_variables; i++)
1296 moments1_destroy ((mom + i)->m);
1299 free (is_missing_case);