1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
82 Variables used (both explanatory and response).
84 static struct variable **v_variables;
89 static size_t n_variables;
92 File where the model will be saved if the EXPORT subcommand
95 struct file_handle *model_file;
98 Return value for the procedure.
100 int pspp_reg_rc = CMD_SUCCESS;
102 static bool run_regression (const struct casefile *, void *);
105 STATISTICS subcommand output functions.
107 static void reg_stats_r (pspp_linreg_cache *);
108 static void reg_stats_coeff (pspp_linreg_cache *);
109 static void reg_stats_anova (pspp_linreg_cache *);
110 static void reg_stats_outs (pspp_linreg_cache *);
111 static void reg_stats_zpp (pspp_linreg_cache *);
112 static void reg_stats_label (pspp_linreg_cache *);
113 static void reg_stats_sha (pspp_linreg_cache *);
114 static void reg_stats_ci (pspp_linreg_cache *);
115 static void reg_stats_f (pspp_linreg_cache *);
116 static void reg_stats_bcov (pspp_linreg_cache *);
117 static void reg_stats_ses (pspp_linreg_cache *);
118 static void reg_stats_xtx (pspp_linreg_cache *);
119 static void reg_stats_collin (pspp_linreg_cache *);
120 static void reg_stats_tol (pspp_linreg_cache *);
121 static void reg_stats_selection (pspp_linreg_cache *);
122 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
123 int, pspp_linreg_cache *);
126 reg_stats_r (pspp_linreg_cache * c)
136 rsq = c->ssm / c->sst;
137 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
138 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
139 t = tab_create (n_cols, n_rows, 0);
140 tab_dim (t, tab_natural_dimensions);
141 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
142 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
143 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
144 tab_vline (t, TAL_0, 1, 0, 0);
146 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
147 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
148 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
149 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
150 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
151 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
152 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
153 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
154 tab_title (t, _("Model Summary"));
159 Table showing estimated regression coefficients.
162 reg_stats_coeff (pspp_linreg_cache * c)
174 const struct variable *v;
175 const union value *val;
180 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
181 n_rows = c->n_coeffs + 2;
183 t = tab_create (n_cols, n_rows, 0);
184 tab_headers (t, 2, 0, 1, 0);
185 tab_dim (t, tab_natural_dimensions);
186 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
187 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
188 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
189 tab_vline (t, TAL_0, 1, 0, 0);
191 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
192 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
193 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
194 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
195 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
196 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
197 coeff = c->coeff[0].estimate;
198 tab_float (t, 2, 1, 0, coeff, 10, 2);
199 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
200 tab_float (t, 3, 1, 0, std_err, 10, 2);
201 beta = coeff / c->depvar_std;
202 tab_float (t, 4, 1, 0, beta, 10, 2);
203 t_stat = coeff / std_err;
204 tab_float (t, 5, 1, 0, t_stat, 10, 2);
205 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
206 tab_float (t, 6, 1, 0, pval, 10, 2);
207 for (j = 1; j <= c->n_indeps; j++)
209 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
210 label = var_to_string (v);
211 /* Do not overwrite the variable's name. */
212 strncpy (tmp, label, MAX_STRING);
213 if (v->type == ALPHA)
216 Append the value associated with this coefficient.
217 This makes sense only if we us the usual binary encoding
221 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
222 val_s = value_to_string (val, v);
223 strncat (tmp, val_s, MAX_STRING);
226 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
228 Regression coefficients.
230 coeff = c->coeff[j].estimate;
231 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
233 Standard error of the coefficients.
235 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
236 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
238 'Standardized' coefficient, i.e., regression coefficient
239 if all variables had unit variance.
241 beta = gsl_vector_get (c->indep_std, j);
242 beta *= coeff / c->depvar_std;
243 tab_float (t, 4, j + 1, 0, beta, 10, 2);
246 Test statistic for H0: coefficient is 0.
248 t_stat = coeff / std_err;
249 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
251 P values for the test statistic above.
253 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
254 tab_float (t, 6, j + 1, 0, pval, 10, 2);
256 tab_title (t, _("Coefficients"));
262 Display the ANOVA table.
265 reg_stats_anova (pspp_linreg_cache * c)
269 const double msm = c->ssm / c->dfm;
270 const double mse = c->sse / c->dfe;
271 const double F = msm / mse;
272 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
277 t = tab_create (n_cols, n_rows, 0);
278 tab_headers (t, 2, 0, 1, 0);
279 tab_dim (t, tab_natural_dimensions);
281 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
283 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
284 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
285 tab_vline (t, TAL_0, 1, 0, 0);
287 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
288 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
289 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
290 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
291 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
293 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
294 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
295 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
297 /* Sums of Squares */
298 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
299 tab_float (t, 2, 3, 0, c->sst, 10, 2);
300 tab_float (t, 2, 2, 0, c->sse, 10, 2);
303 /* Degrees of freedom */
304 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
305 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
306 tab_float (t, 3, 3, 0, c->dft, 4, 0);
310 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
311 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
313 tab_float (t, 5, 1, 0, F, 8, 3);
315 tab_float (t, 6, 1, 0, pval, 8, 3);
317 tab_title (t, _("ANOVA"));
321 reg_stats_outs (pspp_linreg_cache * c)
326 reg_stats_zpp (pspp_linreg_cache * c)
331 reg_stats_label (pspp_linreg_cache * c)
336 reg_stats_sha (pspp_linreg_cache * c)
341 reg_stats_ci (pspp_linreg_cache * c)
346 reg_stats_f (pspp_linreg_cache * c)
351 reg_stats_bcov (pspp_linreg_cache * c)
363 n_cols = c->n_indeps + 1 + 2;
364 n_rows = 2 * (c->n_indeps + 1);
365 t = tab_create (n_cols, n_rows, 0);
366 tab_headers (t, 2, 0, 1, 0);
367 tab_dim (t, tab_natural_dimensions);
368 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
369 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
370 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
371 tab_vline (t, TAL_0, 1, 0, 0);
372 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
373 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
374 for (i = 1; i < c->n_coeffs; i++)
376 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
377 label = var_to_string (v);
378 tab_text (t, 2, i, TAB_CENTER, label);
379 tab_text (t, i + 2, 0, TAB_CENTER, label);
380 for (k = 1; k < c->n_coeffs; k++)
382 col = (i <= k) ? k : i;
383 row = (i <= k) ? i : k;
384 tab_float (t, k + 2, i, TAB_CENTER,
385 gsl_matrix_get (c->cov, row, col), 8, 3);
388 tab_title (t, _("Coefficient Correlations"));
392 reg_stats_ses (pspp_linreg_cache * c)
397 reg_stats_xtx (pspp_linreg_cache * c)
402 reg_stats_collin (pspp_linreg_cache * c)
407 reg_stats_tol (pspp_linreg_cache * c)
412 reg_stats_selection (pspp_linreg_cache * c)
418 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
419 int keyword, pspp_linreg_cache * c)
428 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
431 The order here must match the order in which the STATISTICS
432 keywords appear in the specification section above.
459 Set everything but F.
461 for (i = 0; i < f; i++)
468 for (i = 0; i < all; i++)
476 Default output: ANOVA table, parameter estimates,
477 and statistics for variables not entered into model,
480 if (keywords[defaults] | d)
488 statistics_keyword_output (reg_stats_r, keywords[r], c);
489 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
490 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
491 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
492 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
493 statistics_keyword_output (reg_stats_label, keywords[label], c);
494 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
495 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
496 statistics_keyword_output (reg_stats_f, keywords[f], c);
497 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
498 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
499 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
500 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
501 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
502 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
505 regression_trns_proc (void *m, struct ccase *c, int case_idx UNUSED)
509 pspp_linreg_cache *model = m;
511 const union value **vals = NULL;
512 const union value *obs = NULL;
513 struct variable **vars = NULL;
515 assert (model != NULL);
516 assert (model->depvar != NULL);
517 assert (model->resid != NULL);
519 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_ORDINARY);
520 vals = xnmalloc (n_vars, sizeof (*vals));
521 assert (vals != NULL);
522 output = case_data_rw (c, model->resid->fv);
523 assert (output != NULL);
525 for (i = 0; i < n_vars; i++)
527 /* Do not use the residual variable. */
528 if (vars[i]->index != model->resid->index)
530 /* Do not use the dependent variable as a predictor. */
531 if (vars[i]->index == model->depvar->index)
533 obs = case_data (c, i);
534 assert (obs != NULL);
538 vals[i] = case_data (c, i);
542 output->f = (*model->residual) ((const struct variable **) vars,
543 vals, obs, model, i);
545 return TRNS_CONTINUE;
548 subcommand_save (int save, pspp_linreg_cache *lc)
550 struct variable *residuals = NULL;
553 assert (lc->depvar != NULL);
557 residuals = dict_create_var (default_dict, "residuals", 0);
558 assert (residuals != NULL);
559 lc->resid = residuals;
560 add_transformation (regression_trns_proc, pspp_linreg_cache_free, lc);
564 pspp_linreg_cache_free (lc);
568 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
572 for (i = 0; i < n_vars; i++)
574 if (v->index == varlist[i]->index)
582 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
587 struct variable **varlist;
588 struct pspp_linreg_coeff *coeff;
589 const struct variable *v;
592 fprintf (fp, "%s", reg_export_categorical_encode_1);
594 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
595 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
597 coeff = c->coeff + i;
598 v = pspp_linreg_coeff_get_var (coeff, 0);
599 if (v->type == ALPHA)
601 if (!reg_inserted (v, varlist, n_vars))
603 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
605 varlist[n_vars] = (struct variable *) v;
610 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
611 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
613 for (i = 0; i < n_vars - 1; i++)
615 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
617 fprintf (fp, "&%s};\n\t", varlist[i]->name);
619 for (i = 0; i < n_vars; i++)
621 coeff = c->coeff + i;
622 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
624 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
625 varlist[i]->obs_vals->n_categories);
627 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
629 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
630 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
631 value_to_string (val, varlist[i]));
634 fprintf (fp, "%s", reg_export_categorical_encode_2);
638 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
641 struct pspp_linreg_coeff *coeff;
642 const struct variable *v;
644 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
645 for (i = 1; i < c->n_indeps; i++)
647 coeff = c->coeff + i;
648 v = pspp_linreg_coeff_get_var (coeff, 0);
649 fprintf (fp, "\"%s\",\n\t\t", v->name);
651 coeff = c->coeff + i;
652 v = pspp_linreg_coeff_get_var (coeff, 0);
653 fprintf (fp, "\"%s\"};\n\t", v->name);
656 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
658 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
659 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
660 reg_print_depvars (fp, c);
661 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
663 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
664 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
667 reg_has_categorical (pspp_linreg_cache * c)
670 const struct variable *v;
672 for (i = 1; i < c->n_coeffs; i++)
674 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
675 if (v->type == ALPHA)
684 subcommand_export (int export, pspp_linreg_cache * c)
689 int n_quantiles = 100;
692 struct pspp_linreg_coeff coeff;
697 assert (model_file != NULL);
698 fp = fopen (fh_get_file_name (model_file), "w");
700 fprintf (fp, "%s", reg_preamble);
701 reg_print_getvar (fp, c);
702 if (reg_has_categorical (c))
704 reg_print_categorical_encoding (fp, c);
706 fprintf (fp, "%s", reg_export_t_quantiles_1);
707 increment = 0.5 / (double) increment;
708 for (i = 0; i < n_quantiles - 1; i++)
710 tmp = 0.5 + 0.005 * (double) i;
711 fprintf (fp, "%.15e,\n\t\t",
712 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
714 fprintf (fp, "%.15e};\n\t",
715 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
716 fprintf (fp, "%s", reg_export_t_quantiles_2);
717 fprintf (fp, "%s", reg_mean_cmt);
718 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
719 fprintf (fp, "const char *var_names[])\n{\n\t");
720 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
721 for (i = 1; i < c->n_indeps; i++)
724 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
727 fprintf (fp, "%.15e};\n\t", coeff.estimate);
729 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
730 fprintf (fp, "int i;\n\tint j;\n\n\t");
731 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
732 fprintf (fp, "%s", reg_getvar);
733 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
735 for (i = 0; i < c->cov->size1 - 1; i++)
738 for (j = 0; j < c->cov->size2 - 1; j++)
740 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
742 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
745 for (j = 0; j < c->cov->size2 - 1; j++)
747 fprintf (fp, "%.15e, ",
748 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
750 fprintf (fp, "%.15e}\n\t",
751 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
752 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
754 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
755 fprintf (fp, "%s", reg_variance);
756 fprintf (fp, "%s", reg_export_confidence_interval);
757 tmp = c->mse * c->mse;
758 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
759 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
760 fprintf (fp, "%s", reg_export_prediction_interval_3);
762 fp = fopen ("pspp_model_reg.h", "w");
763 fprintf (fp, "%s", reg_header);
768 regression_custom_export (struct cmd_regression *cmd UNUSED)
770 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
771 if (!lex_force_match ('('))
778 model_file = fh_parse (FH_REF_FILE);
779 if (model_file == NULL)
783 if (!lex_force_match (')'))
790 cmd_regression (void)
792 if (!parse_regression (&cmd))
794 if (!multipass_procedure_with_splits (run_regression, &cmd))
795 return CMD_CASCADING_FAILURE;
803 Is variable k the dependent variable?
806 is_depvar (size_t k, const struct variable *v)
809 compare_var_names returns 0 if the variable
812 if (!compare_var_names (v, v_variables[k], NULL))
819 Mark missing cases. Return the number of non-missing cases.
822 mark_missing_cases (const struct casefile *cf, struct variable *v,
823 int *is_missing_case, double n_data)
825 struct casereader *r;
828 const union value *val;
830 for (r = casefile_get_reader (cf);
831 casereader_read (r, &c); case_destroy (&c))
833 row = casereader_cnum (r) - 1;
835 val = case_data (&c, v->fv);
836 cat_value_update (v, val);
837 if (mv_is_value_missing (&v->miss, val))
839 if (!is_missing_case[row])
841 /* Now it is missing. */
843 is_missing_case[row] = 1;
847 casereader_destroy (r);
852 /* Parser for the variables sub command */
854 regression_custom_variables(struct cmd_regression *cmd UNUSED)
859 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
864 if (!parse_variables (default_dict, &v_variables, &n_variables,
875 Count the explanatory variables. The user may or may
876 not have specified a response variable in the syntax.
879 int get_n_indep (const struct variable *v)
884 result = n_variables;
885 while (i < n_variables)
887 if (is_depvar (i, v))
897 Read from the active file. Identify the explanatory variables in
898 v_variables. Encode categorical variables. Drop cases with missing
902 int prepare_data (int n_data, int is_missing_case[],
903 struct variable **indep_vars,
904 struct variable *depvar,
905 const struct casefile *cf)
910 assert (indep_vars != NULL);
912 for (i = 0; i < n_variables; i++)
914 if (!is_depvar (i, depvar))
916 indep_vars[j] = v_variables[i];
918 if (v_variables[i]->type == ALPHA)
920 /* Make a place to hold the binary vectors
921 corresponding to this variable's values. */
922 cat_stored_values_create (v_variables[i]);
924 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
928 Mark missing cases for the dependent variable.
930 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
935 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
938 size_t n_data = 0; /* Number of valide cases. */
939 size_t n_cases; /* Number of cases. */
945 Keep track of the missing cases.
947 int *is_missing_case;
948 const union value *val;
949 struct casereader *r;
951 struct variable **indep_vars;
952 struct design_matrix *X;
954 pspp_linreg_cache *lcache;
955 pspp_linreg_opts lopts;
959 dict_get_vars (default_dict, &v_variables, &n_variables,
963 n_cases = casefile_get_case_cnt (cf);
965 for (i = 0; i < cmd.n_dependent; i++)
967 if (cmd.v_dependent[i]->type != NUMERIC)
969 msg (SE, gettext ("Dependent variable must be numeric."));
970 pspp_reg_rc = CMD_FAILURE;
975 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
977 lopts.get_depvar_mean_std = 1;
980 for (k = 0; k < cmd.n_dependent; k++)
982 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
983 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
984 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
985 assert (indep_vars != NULL);
987 for (i = 0; i < n_cases; i++)
989 is_missing_case[i] = 0;
991 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
993 (const struct casefile *) cf);
994 Y = gsl_vector_alloc (n_data);
997 design_matrix_create (n_indep, (const struct variable **) indep_vars,
999 for (i = 0; i < X->m->size2; i++)
1001 lopts.get_indep_mean_std[i] = 1;
1003 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1004 lcache->indep_means = gsl_vector_alloc (X->m->size2);
1005 lcache->indep_std = gsl_vector_alloc (X->m->size2);
1006 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
1008 For large data sets, use QR decomposition.
1010 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1012 lcache->method = PSPP_LINREG_SVD;
1016 The second pass creates the design matrix.
1019 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1021 /* Iterate over the cases. */
1023 case_num = casereader_cnum (r) - 1;
1024 if (!is_missing_case[case_num])
1026 for (i = 0; i < n_variables; ++i) /* Iterate over the
1031 val = case_data (&c, v_variables[i]->fv);
1033 Independent/dependent variable separation. The
1034 'variables' subcommand specifies a varlist which contains
1035 both dependent and independent variables. The dependent
1036 variables are specified with the 'dependent'
1037 subcommand, and maybe also in the 'variables' subcommand.
1038 We need to separate the two.
1040 if (!is_depvar (i, cmd.v_dependent[k]))
1042 if (v_variables[i]->type == ALPHA)
1044 design_matrix_set_categorical (X, row, v_variables[i], val);
1046 else if (v_variables[i]->type == NUMERIC)
1048 design_matrix_set_numeric (X, row, v_variables[i], val);
1052 val = case_data (&c, cmd.v_dependent[k]->fv);
1053 gsl_vector_set (Y, row, val->f);
1058 Now that we know the number of coefficients, allocate space
1059 and store pointers to the variables that correspond to the
1062 pspp_linreg_coeff_init (lcache, X);
1065 Find the least-squares estimates and other statistics.
1067 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1068 subcommand_statistics (cmd.a_statistics, lcache);
1069 subcommand_export (cmd.sbc_export, lcache);
1070 subcommand_save (cmd.sbc_save, lcache);
1071 gsl_vector_free (Y);
1072 design_matrix_destroy (X);
1074 free (lopts.get_indep_mean_std);
1075 casereader_destroy (r);
1078 free (is_missing_case);