1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include <procedure.h>
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
81 /* Linear regression models. */
82 pspp_linreg_cache **models = NULL;
85 Variables used (both explanatory and response).
87 static struct variable **v_variables;
92 static size_t n_variables;
95 File where the model will be saved if the EXPORT subcommand
98 struct file_handle *model_file;
101 Return value for the procedure.
103 int pspp_reg_rc = CMD_SUCCESS;
105 static bool run_regression (const struct casefile *, void *);
108 STATISTICS subcommand output functions.
110 static void reg_stats_r (pspp_linreg_cache *);
111 static void reg_stats_coeff (pspp_linreg_cache *);
112 static void reg_stats_anova (pspp_linreg_cache *);
113 static void reg_stats_outs (pspp_linreg_cache *);
114 static void reg_stats_zpp (pspp_linreg_cache *);
115 static void reg_stats_label (pspp_linreg_cache *);
116 static void reg_stats_sha (pspp_linreg_cache *);
117 static void reg_stats_ci (pspp_linreg_cache *);
118 static void reg_stats_f (pspp_linreg_cache *);
119 static void reg_stats_bcov (pspp_linreg_cache *);
120 static void reg_stats_ses (pspp_linreg_cache *);
121 static void reg_stats_xtx (pspp_linreg_cache *);
122 static void reg_stats_collin (pspp_linreg_cache *);
123 static void reg_stats_tol (pspp_linreg_cache *);
124 static void reg_stats_selection (pspp_linreg_cache *);
125 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
126 int, pspp_linreg_cache *);
129 reg_stats_r (pspp_linreg_cache * c)
139 rsq = c->ssm / c->sst;
140 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
141 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
142 t = tab_create (n_cols, n_rows, 0);
143 tab_dim (t, tab_natural_dimensions);
144 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
145 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
146 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
147 tab_vline (t, TAL_0, 1, 0, 0);
149 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
150 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
151 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
152 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
153 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
154 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
155 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
156 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
157 tab_title (t, _("Model Summary"));
162 Table showing estimated regression coefficients.
165 reg_stats_coeff (pspp_linreg_cache * c)
177 const struct variable *v;
178 const union value *val;
183 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
184 n_rows = c->n_coeffs + 2;
186 t = tab_create (n_cols, n_rows, 0);
187 tab_headers (t, 2, 0, 1, 0);
188 tab_dim (t, tab_natural_dimensions);
189 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
190 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
191 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
192 tab_vline (t, TAL_0, 1, 0, 0);
194 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
195 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
196 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
197 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
198 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
199 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
200 coeff = c->coeff[0].estimate;
201 tab_float (t, 2, 1, 0, coeff, 10, 2);
202 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
203 tab_float (t, 3, 1, 0, std_err, 10, 2);
204 beta = coeff / c->depvar_std;
205 tab_float (t, 4, 1, 0, beta, 10, 2);
206 t_stat = coeff / std_err;
207 tab_float (t, 5, 1, 0, t_stat, 10, 2);
208 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
209 tab_float (t, 6, 1, 0, pval, 10, 2);
210 for (j = 1; j <= c->n_indeps; j++)
212 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
213 label = var_to_string (v);
214 /* Do not overwrite the variable's name. */
215 strncpy (tmp, label, MAX_STRING);
216 if (v->type == ALPHA)
219 Append the value associated with this coefficient.
220 This makes sense only if we us the usual binary encoding
224 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
225 val_s = value_to_string (val, v);
226 strncat (tmp, val_s, MAX_STRING);
229 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
231 Regression coefficients.
233 coeff = c->coeff[j].estimate;
234 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
236 Standard error of the coefficients.
238 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
239 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
241 'Standardized' coefficient, i.e., regression coefficient
242 if all variables had unit variance.
244 beta = gsl_vector_get (c->indep_std, j);
245 beta *= coeff / c->depvar_std;
246 tab_float (t, 4, j + 1, 0, beta, 10, 2);
249 Test statistic for H0: coefficient is 0.
251 t_stat = coeff / std_err;
252 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
254 P values for the test statistic above.
256 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
257 tab_float (t, 6, j + 1, 0, pval, 10, 2);
259 tab_title (t, _("Coefficients"));
265 Display the ANOVA table.
268 reg_stats_anova (pspp_linreg_cache * c)
272 const double msm = c->ssm / c->dfm;
273 const double mse = c->sse / c->dfe;
274 const double F = msm / mse;
275 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
280 t = tab_create (n_cols, n_rows, 0);
281 tab_headers (t, 2, 0, 1, 0);
282 tab_dim (t, tab_natural_dimensions);
284 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
286 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
287 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
288 tab_vline (t, TAL_0, 1, 0, 0);
290 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
291 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
292 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
293 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
294 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
296 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
297 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
298 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
300 /* Sums of Squares */
301 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
302 tab_float (t, 2, 3, 0, c->sst, 10, 2);
303 tab_float (t, 2, 2, 0, c->sse, 10, 2);
306 /* Degrees of freedom */
307 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
308 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
309 tab_float (t, 3, 3, 0, c->dft, 4, 0);
313 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
314 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
316 tab_float (t, 5, 1, 0, F, 8, 3);
318 tab_float (t, 6, 1, 0, pval, 8, 3);
320 tab_title (t, _("ANOVA"));
324 reg_stats_outs (pspp_linreg_cache * c)
329 reg_stats_zpp (pspp_linreg_cache * c)
334 reg_stats_label (pspp_linreg_cache * c)
339 reg_stats_sha (pspp_linreg_cache * c)
344 reg_stats_ci (pspp_linreg_cache * c)
349 reg_stats_f (pspp_linreg_cache * c)
354 reg_stats_bcov (pspp_linreg_cache * c)
366 n_cols = c->n_indeps + 1 + 2;
367 n_rows = 2 * (c->n_indeps + 1);
368 t = tab_create (n_cols, n_rows, 0);
369 tab_headers (t, 2, 0, 1, 0);
370 tab_dim (t, tab_natural_dimensions);
371 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
372 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
373 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
374 tab_vline (t, TAL_0, 1, 0, 0);
375 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
376 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
377 for (i = 1; i < c->n_coeffs; i++)
379 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
380 label = var_to_string (v);
381 tab_text (t, 2, i, TAB_CENTER, label);
382 tab_text (t, i + 2, 0, TAB_CENTER, label);
383 for (k = 1; k < c->n_coeffs; k++)
385 col = (i <= k) ? k : i;
386 row = (i <= k) ? i : k;
387 tab_float (t, k + 2, i, TAB_CENTER,
388 gsl_matrix_get (c->cov, row, col), 8, 3);
391 tab_title (t, _("Coefficient Correlations"));
395 reg_stats_ses (pspp_linreg_cache * c)
400 reg_stats_xtx (pspp_linreg_cache * c)
405 reg_stats_collin (pspp_linreg_cache * c)
410 reg_stats_tol (pspp_linreg_cache * c)
415 reg_stats_selection (pspp_linreg_cache * c)
421 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
422 int keyword, pspp_linreg_cache * c)
431 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
434 The order here must match the order in which the STATISTICS
435 keywords appear in the specification section above.
462 Set everything but F.
464 for (i = 0; i < f; i++)
471 for (i = 0; i < all; i++)
479 Default output: ANOVA table, parameter estimates,
480 and statistics for variables not entered into model,
483 if (keywords[defaults] | d)
491 statistics_keyword_output (reg_stats_r, keywords[r], c);
492 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
493 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
494 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
495 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
496 statistics_keyword_output (reg_stats_label, keywords[label], c);
497 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
498 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
499 statistics_keyword_output (reg_stats_f, keywords[f], c);
500 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
501 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
502 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
503 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
504 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
505 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
508 regression_trns_proc (void *m, struct ccase *c, int case_idx UNUSED)
513 pspp_linreg_cache *model = m;
515 const union value **vals = NULL;
516 const union value *obs = NULL;
517 struct variable **vars = NULL;
519 assert (model != NULL);
520 assert (model->depvar != NULL);
521 assert (model->resid != NULL);
523 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_SYSTEM);
524 vals = xnmalloc (n_vars, sizeof (*vals));
525 assert (vals != NULL);
526 output = case_data_rw (c, model->resid->fv);
527 assert (output != NULL);
529 for (i = 0; i < n_vars; i++)
531 /* Do not use the residual variable. */
532 if (vars[i]->index != model->resid->index)
534 /* Do not use the dependent variable as a predictor. */
535 if (vars[i]->index == model->depvar->index)
537 obs = case_data (c, i);
538 assert (obs != NULL);
542 vals[i] = case_data (c, i);
547 output->f = (*model->residual) ((const struct variable **) vars,
548 vals, obs, model, n_vals);
550 return TRNS_CONTINUE;
553 Returns 0 if NAME is a duplicate of any existing variable name.
556 try_name (char *name)
558 if (dict_lookup_var (default_dict, name) != NULL)
565 subcommand_save (int save, pspp_linreg_cache **models)
568 char name[LONG_NAME_LEN + 1];
569 struct variable *residuals = NULL;
570 pspp_linreg_cache **lc;
572 assert (models != NULL);
577 snprintf (name, LONG_NAME_LEN, "RES%d", i);
578 for (lc = models; lc < models + cmd.n_dependent; lc++)
580 assert (*lc != NULL);
581 assert ((*lc)->depvar != NULL);
582 while (!try_name (name))
585 snprintf (name, LONG_NAME_LEN, "RES%d", i);
587 residuals = dict_create_var (default_dict, name, 0);
588 assert (residuals != NULL);
589 (*lc)->resid = residuals;
590 add_transformation (regression_trns_proc, pspp_linreg_cache_free, *lc);
595 for (lc = models; lc < models + cmd.n_dependent; lc++)
597 assert (*lc != NULL);
598 pspp_linreg_cache_free (*lc);
603 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
607 for (i = 0; i < n_vars; i++)
609 if (v->index == varlist[i]->index)
617 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
622 struct variable **varlist;
623 struct pspp_linreg_coeff *coeff;
624 const struct variable *v;
627 fprintf (fp, "%s", reg_export_categorical_encode_1);
629 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
630 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
632 coeff = c->coeff + i;
633 v = pspp_linreg_coeff_get_var (coeff, 0);
634 if (v->type == ALPHA)
636 if (!reg_inserted (v, varlist, n_vars))
638 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
640 varlist[n_vars] = (struct variable *) v;
645 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
646 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
648 for (i = 0; i < n_vars - 1; i++)
650 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
652 fprintf (fp, "&%s};\n\t", varlist[i]->name);
654 for (i = 0; i < n_vars; i++)
656 coeff = c->coeff + i;
657 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
659 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
660 varlist[i]->obs_vals->n_categories);
662 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
664 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
665 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
666 value_to_string (val, varlist[i]));
669 fprintf (fp, "%s", reg_export_categorical_encode_2);
673 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
676 struct pspp_linreg_coeff *coeff;
677 const struct variable *v;
679 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
680 for (i = 1; i < c->n_indeps; i++)
682 coeff = c->coeff + i;
683 v = pspp_linreg_coeff_get_var (coeff, 0);
684 fprintf (fp, "\"%s\",\n\t\t", v->name);
686 coeff = c->coeff + i;
687 v = pspp_linreg_coeff_get_var (coeff, 0);
688 fprintf (fp, "\"%s\"};\n\t", v->name);
691 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
693 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
694 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
695 reg_print_depvars (fp, c);
696 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
698 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
699 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
702 reg_has_categorical (pspp_linreg_cache * c)
705 const struct variable *v;
707 for (i = 1; i < c->n_coeffs; i++)
709 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
710 if (v->type == ALPHA)
719 subcommand_export (int export, pspp_linreg_cache * c)
724 int n_quantiles = 100;
727 struct pspp_linreg_coeff coeff;
732 assert (model_file != NULL);
733 fp = fopen (fh_get_file_name (model_file), "w");
735 fprintf (fp, "%s", reg_preamble);
736 reg_print_getvar (fp, c);
737 if (reg_has_categorical (c))
739 reg_print_categorical_encoding (fp, c);
741 fprintf (fp, "%s", reg_export_t_quantiles_1);
742 increment = 0.5 / (double) increment;
743 for (i = 0; i < n_quantiles - 1; i++)
745 tmp = 0.5 + 0.005 * (double) i;
746 fprintf (fp, "%.15e,\n\t\t",
747 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
749 fprintf (fp, "%.15e};\n\t",
750 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
751 fprintf (fp, "%s", reg_export_t_quantiles_2);
752 fprintf (fp, "%s", reg_mean_cmt);
753 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
754 fprintf (fp, "const char *var_names[])\n{\n\t");
755 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
756 for (i = 1; i < c->n_indeps; i++)
759 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
762 fprintf (fp, "%.15e};\n\t", coeff.estimate);
764 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
765 fprintf (fp, "int i;\n\tint j;\n\n\t");
766 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
767 fprintf (fp, "%s", reg_getvar);
768 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
770 for (i = 0; i < c->cov->size1 - 1; i++)
773 for (j = 0; j < c->cov->size2 - 1; j++)
775 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
777 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
780 for (j = 0; j < c->cov->size2 - 1; j++)
782 fprintf (fp, "%.15e, ",
783 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
785 fprintf (fp, "%.15e}\n\t",
786 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
787 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
789 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
790 fprintf (fp, "%s", reg_variance);
791 fprintf (fp, "%s", reg_export_confidence_interval);
792 tmp = c->mse * c->mse;
793 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
794 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
795 fprintf (fp, "%s", reg_export_prediction_interval_3);
797 fp = fopen ("pspp_model_reg.h", "w");
798 fprintf (fp, "%s", reg_header);
803 regression_custom_export (struct cmd_regression *cmd UNUSED)
805 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
806 if (!lex_force_match ('('))
813 model_file = fh_parse (FH_REF_FILE);
814 if (model_file == NULL)
818 if (!lex_force_match (')'))
825 cmd_regression (void)
827 if (!parse_regression (&cmd))
830 models = xnmalloc (cmd.n_dependent, sizeof *models);
831 if (!multipass_procedure_with_splits (run_regression, &cmd))
832 return CMD_CASCADING_FAILURE;
833 subcommand_save (cmd.sbc_save, models);
840 Is variable k the dependent variable?
843 is_depvar (size_t k, const struct variable *v)
846 compare_var_names returns 0 if the variable
849 if (!compare_var_names (v, v_variables[k], NULL))
856 Mark missing cases. Return the number of non-missing cases.
859 mark_missing_cases (const struct casefile *cf, struct variable *v,
860 int *is_missing_case, double n_data)
862 struct casereader *r;
865 const union value *val;
867 for (r = casefile_get_reader (cf);
868 casereader_read (r, &c); case_destroy (&c))
870 row = casereader_cnum (r) - 1;
872 val = case_data (&c, v->fv);
873 cat_value_update (v, val);
874 if (mv_is_value_missing (&v->miss, val))
876 if (!is_missing_case[row])
878 /* Now it is missing. */
880 is_missing_case[row] = 1;
884 casereader_destroy (r);
889 /* Parser for the variables sub command */
891 regression_custom_variables(struct cmd_regression *cmd UNUSED)
896 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
901 if (!parse_variables (default_dict, &v_variables, &n_variables,
912 Count the explanatory variables. The user may or may
913 not have specified a response variable in the syntax.
916 int get_n_indep (const struct variable *v)
921 result = n_variables;
922 while (i < n_variables)
924 if (is_depvar (i, v))
934 Read from the active file. Identify the explanatory variables in
935 v_variables. Encode categorical variables. Drop cases with missing
939 int prepare_data (int n_data, int is_missing_case[],
940 struct variable **indep_vars,
941 struct variable *depvar,
942 const struct casefile *cf)
947 assert (indep_vars != NULL);
949 for (i = 0; i < n_variables; i++)
951 if (!is_depvar (i, depvar))
953 indep_vars[j] = v_variables[i];
955 if (v_variables[i]->type == ALPHA)
957 /* Make a place to hold the binary vectors
958 corresponding to this variable's values. */
959 cat_stored_values_create (v_variables[i]);
961 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
965 Mark missing cases for the dependent variable.
967 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
972 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
975 size_t n_data = 0; /* Number of valide cases. */
976 size_t n_cases; /* Number of cases. */
982 Keep track of the missing cases.
984 int *is_missing_case;
985 const union value *val;
986 struct casereader *r;
988 struct variable **indep_vars;
989 struct design_matrix *X;
992 pspp_linreg_opts lopts;
994 assert (models != NULL);
997 dict_get_vars (default_dict, &v_variables, &n_variables,
1001 n_cases = casefile_get_case_cnt (cf);
1003 for (i = 0; i < cmd.n_dependent; i++)
1005 if (cmd.v_dependent[i]->type != NUMERIC)
1007 msg (SE, gettext ("Dependent variable must be numeric."));
1008 pspp_reg_rc = CMD_FAILURE;
1013 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1015 lopts.get_depvar_mean_std = 1;
1017 for (k = 0; k < cmd.n_dependent; k++)
1019 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1020 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1021 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1022 assert (indep_vars != NULL);
1024 for (i = 0; i < n_cases; i++)
1026 is_missing_case[i] = 0;
1028 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1030 (const struct casefile *) cf);
1031 Y = gsl_vector_alloc (n_data);
1034 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1036 for (i = 0; i < X->m->size2; i++)
1038 lopts.get_indep_mean_std[i] = 1;
1040 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1041 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1042 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1043 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1045 For large data sets, use QR decomposition.
1047 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1049 models[k]->method = PSPP_LINREG_SVD;
1053 The second pass fills the design matrix.
1056 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1058 /* Iterate over the cases. */
1060 case_num = casereader_cnum (r) - 1;
1061 if (!is_missing_case[case_num])
1063 for (i = 0; i < n_variables; ++i) /* Iterate over the
1068 val = case_data (&c, v_variables[i]->fv);
1070 Independent/dependent variable separation. The
1071 'variables' subcommand specifies a varlist which contains
1072 both dependent and independent variables. The dependent
1073 variables are specified with the 'dependent'
1074 subcommand, and maybe also in the 'variables' subcommand.
1075 We need to separate the two.
1077 if (!is_depvar (i, cmd.v_dependent[k]))
1079 if (v_variables[i]->type == ALPHA)
1081 design_matrix_set_categorical (X, row, v_variables[i], val);
1083 else if (v_variables[i]->type == NUMERIC)
1085 design_matrix_set_numeric (X, row, v_variables[i], val);
1089 val = case_data (&c, cmd.v_dependent[k]->fv);
1090 gsl_vector_set (Y, row, val->f);
1095 Now that we know the number of coefficients, allocate space
1096 and store pointers to the variables that correspond to the
1099 pspp_linreg_coeff_init (models[k], X);
1102 Find the least-squares estimates and other statistics.
1104 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1105 subcommand_statistics (cmd.a_statistics, models[k]);
1106 subcommand_export (cmd.sbc_export, models[k]);
1108 gsl_vector_free (Y);
1109 design_matrix_destroy (X);
1111 free (lopts.get_indep_mean_std);
1112 casereader_destroy (r);
1115 free (is_missing_case);