1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include <procedure.h>
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
81 /* Linear regression models. */
82 pspp_linreg_cache **models = NULL;
85 Variables used (both explanatory and response).
87 static struct variable **v_variables;
92 static size_t n_variables;
95 File where the model will be saved if the EXPORT subcommand
98 struct file_handle *model_file;
101 Return value for the procedure.
103 int pspp_reg_rc = CMD_SUCCESS;
105 static bool run_regression (const struct casefile *, void *);
108 STATISTICS subcommand output functions.
110 static void reg_stats_r (pspp_linreg_cache *);
111 static void reg_stats_coeff (pspp_linreg_cache *);
112 static void reg_stats_anova (pspp_linreg_cache *);
113 static void reg_stats_outs (pspp_linreg_cache *);
114 static void reg_stats_zpp (pspp_linreg_cache *);
115 static void reg_stats_label (pspp_linreg_cache *);
116 static void reg_stats_sha (pspp_linreg_cache *);
117 static void reg_stats_ci (pspp_linreg_cache *);
118 static void reg_stats_f (pspp_linreg_cache *);
119 static void reg_stats_bcov (pspp_linreg_cache *);
120 static void reg_stats_ses (pspp_linreg_cache *);
121 static void reg_stats_xtx (pspp_linreg_cache *);
122 static void reg_stats_collin (pspp_linreg_cache *);
123 static void reg_stats_tol (pspp_linreg_cache *);
124 static void reg_stats_selection (pspp_linreg_cache *);
125 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
126 int, pspp_linreg_cache *);
129 reg_stats_r (pspp_linreg_cache * c)
139 rsq = c->ssm / c->sst;
140 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
141 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
142 t = tab_create (n_cols, n_rows, 0);
143 tab_dim (t, tab_natural_dimensions);
144 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
145 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
146 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
147 tab_vline (t, TAL_0, 1, 0, 0);
149 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
150 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
151 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
152 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
153 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
154 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
155 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
156 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
157 tab_title (t, _("Model Summary"));
162 Table showing estimated regression coefficients.
165 reg_stats_coeff (pspp_linreg_cache * c)
177 const struct variable *v;
178 const union value *val;
183 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
184 n_rows = c->n_coeffs + 2;
186 t = tab_create (n_cols, n_rows, 0);
187 tab_headers (t, 2, 0, 1, 0);
188 tab_dim (t, tab_natural_dimensions);
189 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
190 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
191 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
192 tab_vline (t, TAL_0, 1, 0, 0);
194 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
195 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
196 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
197 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
198 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
199 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
200 coeff = c->coeff[0].estimate;
201 tab_float (t, 2, 1, 0, coeff, 10, 2);
202 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
203 tab_float (t, 3, 1, 0, std_err, 10, 2);
204 beta = coeff / c->depvar_std;
205 tab_float (t, 4, 1, 0, beta, 10, 2);
206 t_stat = coeff / std_err;
207 tab_float (t, 5, 1, 0, t_stat, 10, 2);
208 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
209 tab_float (t, 6, 1, 0, pval, 10, 2);
210 for (j = 1; j <= c->n_indeps; j++)
212 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
213 label = var_to_string (v);
214 /* Do not overwrite the variable's name. */
215 strncpy (tmp, label, MAX_STRING);
216 if (v->type == ALPHA)
219 Append the value associated with this coefficient.
220 This makes sense only if we us the usual binary encoding
224 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
225 val_s = value_to_string (val, v);
226 strncat (tmp, val_s, MAX_STRING);
229 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
231 Regression coefficients.
233 coeff = c->coeff[j].estimate;
234 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
236 Standard error of the coefficients.
238 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
239 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
241 'Standardized' coefficient, i.e., regression coefficient
242 if all variables had unit variance.
244 beta = gsl_vector_get (c->indep_std, j);
245 beta *= coeff / c->depvar_std;
246 tab_float (t, 4, j + 1, 0, beta, 10, 2);
249 Test statistic for H0: coefficient is 0.
251 t_stat = coeff / std_err;
252 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
254 P values for the test statistic above.
256 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
257 tab_float (t, 6, j + 1, 0, pval, 10, 2);
259 tab_title (t, _("Coefficients"));
265 Display the ANOVA table.
268 reg_stats_anova (pspp_linreg_cache * c)
272 const double msm = c->ssm / c->dfm;
273 const double mse = c->sse / c->dfe;
274 const double F = msm / mse;
275 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
280 t = tab_create (n_cols, n_rows, 0);
281 tab_headers (t, 2, 0, 1, 0);
282 tab_dim (t, tab_natural_dimensions);
284 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
286 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
287 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
288 tab_vline (t, TAL_0, 1, 0, 0);
290 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
291 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
292 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
293 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
294 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
296 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
297 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
298 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
300 /* Sums of Squares */
301 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
302 tab_float (t, 2, 3, 0, c->sst, 10, 2);
303 tab_float (t, 2, 2, 0, c->sse, 10, 2);
306 /* Degrees of freedom */
307 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
308 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
309 tab_float (t, 3, 3, 0, c->dft, 4, 0);
313 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
314 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
316 tab_float (t, 5, 1, 0, F, 8, 3);
318 tab_float (t, 6, 1, 0, pval, 8, 3);
320 tab_title (t, _("ANOVA"));
324 reg_stats_outs (pspp_linreg_cache * c)
329 reg_stats_zpp (pspp_linreg_cache * c)
334 reg_stats_label (pspp_linreg_cache * c)
339 reg_stats_sha (pspp_linreg_cache * c)
344 reg_stats_ci (pspp_linreg_cache * c)
349 reg_stats_f (pspp_linreg_cache * c)
354 reg_stats_bcov (pspp_linreg_cache * c)
366 n_cols = c->n_indeps + 1 + 2;
367 n_rows = 2 * (c->n_indeps + 1);
368 t = tab_create (n_cols, n_rows, 0);
369 tab_headers (t, 2, 0, 1, 0);
370 tab_dim (t, tab_natural_dimensions);
371 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
372 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
373 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
374 tab_vline (t, TAL_0, 1, 0, 0);
375 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
376 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
377 for (i = 1; i < c->n_coeffs; i++)
379 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
380 label = var_to_string (v);
381 tab_text (t, 2, i, TAB_CENTER, label);
382 tab_text (t, i + 2, 0, TAB_CENTER, label);
383 for (k = 1; k < c->n_coeffs; k++)
385 col = (i <= k) ? k : i;
386 row = (i <= k) ? i : k;
387 tab_float (t, k + 2, i, TAB_CENTER,
388 gsl_matrix_get (c->cov, row, col), 8, 3);
391 tab_title (t, _("Coefficient Correlations"));
395 reg_stats_ses (pspp_linreg_cache * c)
400 reg_stats_xtx (pspp_linreg_cache * c)
405 reg_stats_collin (pspp_linreg_cache * c)
410 reg_stats_tol (pspp_linreg_cache * c)
415 reg_stats_selection (pspp_linreg_cache * c)
421 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
422 int keyword, pspp_linreg_cache * c)
431 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
434 The order here must match the order in which the STATISTICS
435 keywords appear in the specification section above.
462 Set everything but F.
464 for (i = 0; i < f; i++)
471 for (i = 0; i < all; i++)
479 Default output: ANOVA table, parameter estimates,
480 and statistics for variables not entered into model,
483 if (keywords[defaults] | d)
491 statistics_keyword_output (reg_stats_r, keywords[r], c);
492 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
493 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
494 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
495 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
496 statistics_keyword_output (reg_stats_label, keywords[label], c);
497 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
498 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
499 statistics_keyword_output (reg_stats_f, keywords[f], c);
500 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
501 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
502 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
503 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
504 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
505 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
508 regression_trns_proc (void *m, struct ccase *c, int case_idx UNUSED)
513 pspp_linreg_cache *model = m;
515 const union value **vals = NULL;
516 const union value *obs = NULL;
517 struct variable **vars = NULL;
519 assert (model != NULL);
520 assert (model->depvar != NULL);
521 assert (model->resid != NULL);
523 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_SYSTEM);
524 vals = xnmalloc (n_vars, sizeof (*vals));
525 assert (vals != NULL);
526 output = case_data_rw (c, model->resid->fv);
527 assert (output != NULL);
529 for (i = 0; i < n_vars; i++)
531 /* Do not use the residual variable. */
532 if (vars[i]->index != model->resid->index)
534 /* Do not use the dependent variable as a predictor. */
535 if (vars[i]->index == model->depvar->index)
537 obs = case_data (c, i);
538 assert (obs != NULL);
542 vals[i] = case_data (c, i);
547 output->f = (*model->residual) ((const struct variable **) vars,
548 vals, obs, model, n_vals);
550 return TRNS_CONTINUE;
553 subcommand_save (int save, pspp_linreg_cache **models)
555 struct variable *residuals = NULL;
556 pspp_linreg_cache **lc;
558 assert (models != NULL);
562 for (lc = models; lc < models + cmd.n_dependent; lc++)
564 assert (*lc != NULL);
565 assert ((*lc)->depvar != NULL);
566 residuals = dict_create_var (default_dict, "residuals", 0);
567 assert (residuals != NULL);
568 (*lc)->resid = residuals;
569 add_transformation (regression_trns_proc, pspp_linreg_cache_free, *lc);
574 for (lc = models; lc < models + cmd.n_dependent; lc++)
576 assert (*lc != NULL);
577 pspp_linreg_cache_free (*lc);
582 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
586 for (i = 0; i < n_vars; i++)
588 if (v->index == varlist[i]->index)
596 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
601 struct variable **varlist;
602 struct pspp_linreg_coeff *coeff;
603 const struct variable *v;
606 fprintf (fp, "%s", reg_export_categorical_encode_1);
608 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
609 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
611 coeff = c->coeff + i;
612 v = pspp_linreg_coeff_get_var (coeff, 0);
613 if (v->type == ALPHA)
615 if (!reg_inserted (v, varlist, n_vars))
617 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
619 varlist[n_vars] = (struct variable *) v;
624 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
625 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
627 for (i = 0; i < n_vars - 1; i++)
629 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
631 fprintf (fp, "&%s};\n\t", varlist[i]->name);
633 for (i = 0; i < n_vars; i++)
635 coeff = c->coeff + i;
636 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
638 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
639 varlist[i]->obs_vals->n_categories);
641 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
643 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
644 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
645 value_to_string (val, varlist[i]));
648 fprintf (fp, "%s", reg_export_categorical_encode_2);
652 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
655 struct pspp_linreg_coeff *coeff;
656 const struct variable *v;
658 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
659 for (i = 1; i < c->n_indeps; i++)
661 coeff = c->coeff + i;
662 v = pspp_linreg_coeff_get_var (coeff, 0);
663 fprintf (fp, "\"%s\",\n\t\t", v->name);
665 coeff = c->coeff + i;
666 v = pspp_linreg_coeff_get_var (coeff, 0);
667 fprintf (fp, "\"%s\"};\n\t", v->name);
670 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
672 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
673 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
674 reg_print_depvars (fp, c);
675 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
677 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
678 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
681 reg_has_categorical (pspp_linreg_cache * c)
684 const struct variable *v;
686 for (i = 1; i < c->n_coeffs; i++)
688 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
689 if (v->type == ALPHA)
698 subcommand_export (int export, pspp_linreg_cache * c)
703 int n_quantiles = 100;
706 struct pspp_linreg_coeff coeff;
711 assert (model_file != NULL);
712 fp = fopen (fh_get_file_name (model_file), "w");
714 fprintf (fp, "%s", reg_preamble);
715 reg_print_getvar (fp, c);
716 if (reg_has_categorical (c))
718 reg_print_categorical_encoding (fp, c);
720 fprintf (fp, "%s", reg_export_t_quantiles_1);
721 increment = 0.5 / (double) increment;
722 for (i = 0; i < n_quantiles - 1; i++)
724 tmp = 0.5 + 0.005 * (double) i;
725 fprintf (fp, "%.15e,\n\t\t",
726 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
728 fprintf (fp, "%.15e};\n\t",
729 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
730 fprintf (fp, "%s", reg_export_t_quantiles_2);
731 fprintf (fp, "%s", reg_mean_cmt);
732 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
733 fprintf (fp, "const char *var_names[])\n{\n\t");
734 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
735 for (i = 1; i < c->n_indeps; i++)
738 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
741 fprintf (fp, "%.15e};\n\t", coeff.estimate);
743 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
744 fprintf (fp, "int i;\n\tint j;\n\n\t");
745 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
746 fprintf (fp, "%s", reg_getvar);
747 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
749 for (i = 0; i < c->cov->size1 - 1; i++)
752 for (j = 0; j < c->cov->size2 - 1; j++)
754 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
756 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
759 for (j = 0; j < c->cov->size2 - 1; j++)
761 fprintf (fp, "%.15e, ",
762 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
764 fprintf (fp, "%.15e}\n\t",
765 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
766 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
768 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
769 fprintf (fp, "%s", reg_variance);
770 fprintf (fp, "%s", reg_export_confidence_interval);
771 tmp = c->mse * c->mse;
772 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
773 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
774 fprintf (fp, "%s", reg_export_prediction_interval_3);
776 fp = fopen ("pspp_model_reg.h", "w");
777 fprintf (fp, "%s", reg_header);
782 regression_custom_export (struct cmd_regression *cmd UNUSED)
784 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
785 if (!lex_force_match ('('))
792 model_file = fh_parse (FH_REF_FILE);
793 if (model_file == NULL)
797 if (!lex_force_match (')'))
804 cmd_regression (void)
806 if (!parse_regression (&cmd))
809 models = xnmalloc (cmd.n_dependent, sizeof *models);
810 if (!multipass_procedure_with_splits (run_regression, &cmd))
811 return CMD_CASCADING_FAILURE;
812 subcommand_save (cmd.sbc_save, models);
819 Is variable k the dependent variable?
822 is_depvar (size_t k, const struct variable *v)
825 compare_var_names returns 0 if the variable
828 if (!compare_var_names (v, v_variables[k], NULL))
835 Mark missing cases. Return the number of non-missing cases.
838 mark_missing_cases (const struct casefile *cf, struct variable *v,
839 int *is_missing_case, double n_data)
841 struct casereader *r;
844 const union value *val;
846 for (r = casefile_get_reader (cf);
847 casereader_read (r, &c); case_destroy (&c))
849 row = casereader_cnum (r) - 1;
851 val = case_data (&c, v->fv);
852 cat_value_update (v, val);
853 if (mv_is_value_missing (&v->miss, val))
855 if (!is_missing_case[row])
857 /* Now it is missing. */
859 is_missing_case[row] = 1;
863 casereader_destroy (r);
868 /* Parser for the variables sub command */
870 regression_custom_variables(struct cmd_regression *cmd UNUSED)
875 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
880 if (!parse_variables (default_dict, &v_variables, &n_variables,
891 Count the explanatory variables. The user may or may
892 not have specified a response variable in the syntax.
895 int get_n_indep (const struct variable *v)
900 result = n_variables;
901 while (i < n_variables)
903 if (is_depvar (i, v))
913 Read from the active file. Identify the explanatory variables in
914 v_variables. Encode categorical variables. Drop cases with missing
918 int prepare_data (int n_data, int is_missing_case[],
919 struct variable **indep_vars,
920 struct variable *depvar,
921 const struct casefile *cf)
926 assert (indep_vars != NULL);
928 for (i = 0; i < n_variables; i++)
930 if (!is_depvar (i, depvar))
932 indep_vars[j] = v_variables[i];
934 if (v_variables[i]->type == ALPHA)
936 /* Make a place to hold the binary vectors
937 corresponding to this variable's values. */
938 cat_stored_values_create (v_variables[i]);
940 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
944 Mark missing cases for the dependent variable.
946 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
951 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
954 size_t n_data = 0; /* Number of valide cases. */
955 size_t n_cases; /* Number of cases. */
961 Keep track of the missing cases.
963 int *is_missing_case;
964 const union value *val;
965 struct casereader *r;
967 struct variable **indep_vars;
968 struct design_matrix *X;
971 pspp_linreg_opts lopts;
973 assert (models != NULL);
976 dict_get_vars (default_dict, &v_variables, &n_variables,
980 n_cases = casefile_get_case_cnt (cf);
982 for (i = 0; i < cmd.n_dependent; i++)
984 if (cmd.v_dependent[i]->type != NUMERIC)
986 msg (SE, gettext ("Dependent variable must be numeric."));
987 pspp_reg_rc = CMD_FAILURE;
992 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
994 lopts.get_depvar_mean_std = 1;
996 for (k = 0; k < cmd.n_dependent; k++)
998 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
999 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1000 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1001 assert (indep_vars != NULL);
1003 for (i = 0; i < n_cases; i++)
1005 is_missing_case[i] = 0;
1007 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1009 (const struct casefile *) cf);
1010 Y = gsl_vector_alloc (n_data);
1013 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1015 for (i = 0; i < X->m->size2; i++)
1017 lopts.get_indep_mean_std[i] = 1;
1019 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1020 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1021 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1022 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1024 For large data sets, use QR decomposition.
1026 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1028 models[k]->method = PSPP_LINREG_SVD;
1032 The second pass fills the design matrix.
1035 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1037 /* Iterate over the cases. */
1039 case_num = casereader_cnum (r) - 1;
1040 if (!is_missing_case[case_num])
1042 for (i = 0; i < n_variables; ++i) /* Iterate over the
1047 val = case_data (&c, v_variables[i]->fv);
1049 Independent/dependent variable separation. The
1050 'variables' subcommand specifies a varlist which contains
1051 both dependent and independent variables. The dependent
1052 variables are specified with the 'dependent'
1053 subcommand, and maybe also in the 'variables' subcommand.
1054 We need to separate the two.
1056 if (!is_depvar (i, cmd.v_dependent[k]))
1058 if (v_variables[i]->type == ALPHA)
1060 design_matrix_set_categorical (X, row, v_variables[i], val);
1062 else if (v_variables[i]->type == NUMERIC)
1064 design_matrix_set_numeric (X, row, v_variables[i], val);
1068 val = case_data (&c, cmd.v_dependent[k]->fv);
1069 gsl_vector_set (Y, row, val->f);
1074 Now that we know the number of coefficients, allocate space
1075 and store pointers to the variables that correspond to the
1078 pspp_linreg_coeff_init (models[k], X);
1081 Find the least-squares estimates and other statistics.
1083 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1084 subcommand_statistics (cmd.a_statistics, models[k]);
1085 subcommand_export (cmd.sbc_export, models[k]);
1087 gsl_vector_free (Y);
1088 design_matrix_destroy (X);
1090 free (lopts.get_indep_mean_std);
1091 casereader_destroy (r);
1094 free (is_missing_case);