1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
82 Variables used (both explanatory and response).
84 static struct variable **v_variables;
89 static size_t n_variables;
92 File where the model will be saved if the EXPORT subcommand
95 struct file_handle *model_file;
98 Return value for the procedure.
100 int pspp_reg_rc = CMD_SUCCESS;
102 static bool run_regression (const struct casefile *, void *);
105 STATISTICS subcommand output functions.
107 static void reg_stats_r (pspp_linreg_cache *);
108 static void reg_stats_coeff (pspp_linreg_cache *);
109 static void reg_stats_anova (pspp_linreg_cache *);
110 static void reg_stats_outs (pspp_linreg_cache *);
111 static void reg_stats_zpp (pspp_linreg_cache *);
112 static void reg_stats_label (pspp_linreg_cache *);
113 static void reg_stats_sha (pspp_linreg_cache *);
114 static void reg_stats_ci (pspp_linreg_cache *);
115 static void reg_stats_f (pspp_linreg_cache *);
116 static void reg_stats_bcov (pspp_linreg_cache *);
117 static void reg_stats_ses (pspp_linreg_cache *);
118 static void reg_stats_xtx (pspp_linreg_cache *);
119 static void reg_stats_collin (pspp_linreg_cache *);
120 static void reg_stats_tol (pspp_linreg_cache *);
121 static void reg_stats_selection (pspp_linreg_cache *);
122 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
123 int, pspp_linreg_cache *);
126 reg_stats_r (pspp_linreg_cache * c)
136 rsq = c->ssm / c->sst;
137 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
138 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
139 t = tab_create (n_cols, n_rows, 0);
140 tab_dim (t, tab_natural_dimensions);
141 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
142 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
143 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
144 tab_vline (t, TAL_0, 1, 0, 0);
146 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
147 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
148 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
149 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
150 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
151 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
152 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
153 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
154 tab_title (t, _("Model Summary"));
159 Table showing estimated regression coefficients.
162 reg_stats_coeff (pspp_linreg_cache * c)
174 const struct variable *v;
175 const union value *val;
180 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
181 n_rows = c->n_coeffs + 2;
183 t = tab_create (n_cols, n_rows, 0);
184 tab_headers (t, 2, 0, 1, 0);
185 tab_dim (t, tab_natural_dimensions);
186 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
187 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
188 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
189 tab_vline (t, TAL_0, 1, 0, 0);
191 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
192 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
193 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
194 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
195 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
196 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
197 coeff = c->coeff[0].estimate;
198 tab_float (t, 2, 1, 0, coeff, 10, 2);
199 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
200 tab_float (t, 3, 1, 0, std_err, 10, 2);
201 beta = coeff / c->depvar_std;
202 tab_float (t, 4, 1, 0, beta, 10, 2);
203 t_stat = coeff / std_err;
204 tab_float (t, 5, 1, 0, t_stat, 10, 2);
205 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
206 tab_float (t, 6, 1, 0, pval, 10, 2);
207 for (j = 1; j <= c->n_indeps; j++)
209 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
210 label = var_to_string (v);
211 /* Do not overwrite the variable's name. */
212 strncpy (tmp, label, MAX_STRING);
213 if (v->type == ALPHA)
216 Append the value associated with this coefficient.
217 This makes sense only if we us the usual binary encoding
221 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
222 val_s = value_to_string (val, v);
223 strncat (tmp, val_s, MAX_STRING);
226 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
228 Regression coefficients.
230 coeff = c->coeff[j].estimate;
231 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
233 Standard error of the coefficients.
235 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
236 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
238 'Standardized' coefficient, i.e., regression coefficient
239 if all variables had unit variance.
241 beta = gsl_vector_get (c->indep_std, j);
242 beta *= coeff / c->depvar_std;
243 tab_float (t, 4, j + 1, 0, beta, 10, 2);
246 Test statistic for H0: coefficient is 0.
248 t_stat = coeff / std_err;
249 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
251 P values for the test statistic above.
253 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
254 tab_float (t, 6, j + 1, 0, pval, 10, 2);
256 tab_title (t, _("Coefficients"));
262 Display the ANOVA table.
265 reg_stats_anova (pspp_linreg_cache * c)
269 const double msm = c->ssm / c->dfm;
270 const double mse = c->sse / c->dfe;
271 const double F = msm / mse;
272 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
277 t = tab_create (n_cols, n_rows, 0);
278 tab_headers (t, 2, 0, 1, 0);
279 tab_dim (t, tab_natural_dimensions);
281 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
283 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
284 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
285 tab_vline (t, TAL_0, 1, 0, 0);
287 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
288 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
289 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
290 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
291 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
293 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
294 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
295 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
297 /* Sums of Squares */
298 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
299 tab_float (t, 2, 3, 0, c->sst, 10, 2);
300 tab_float (t, 2, 2, 0, c->sse, 10, 2);
303 /* Degrees of freedom */
304 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
305 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
306 tab_float (t, 3, 3, 0, c->dft, 4, 0);
310 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
311 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
313 tab_float (t, 5, 1, 0, F, 8, 3);
315 tab_float (t, 6, 1, 0, pval, 8, 3);
317 tab_title (t, _("ANOVA"));
321 reg_stats_outs (pspp_linreg_cache * c)
326 reg_stats_zpp (pspp_linreg_cache * c)
331 reg_stats_label (pspp_linreg_cache * c)
336 reg_stats_sha (pspp_linreg_cache * c)
341 reg_stats_ci (pspp_linreg_cache * c)
346 reg_stats_f (pspp_linreg_cache * c)
351 reg_stats_bcov (pspp_linreg_cache * c)
363 n_cols = c->n_indeps + 1 + 2;
364 n_rows = 2 * (c->n_indeps + 1);
365 t = tab_create (n_cols, n_rows, 0);
366 tab_headers (t, 2, 0, 1, 0);
367 tab_dim (t, tab_natural_dimensions);
368 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
369 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
370 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
371 tab_vline (t, TAL_0, 1, 0, 0);
372 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
373 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
374 for (i = 1; i < c->n_coeffs; i++)
376 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
377 label = var_to_string (v);
378 tab_text (t, 2, i, TAB_CENTER, label);
379 tab_text (t, i + 2, 0, TAB_CENTER, label);
380 for (k = 1; k < c->n_coeffs; k++)
382 col = (i <= k) ? k : i;
383 row = (i <= k) ? i : k;
384 tab_float (t, k + 2, i, TAB_CENTER,
385 gsl_matrix_get (c->cov, row, col), 8, 3);
388 tab_title (t, _("Coefficient Correlations"));
392 reg_stats_ses (pspp_linreg_cache * c)
397 reg_stats_xtx (pspp_linreg_cache * c)
402 reg_stats_collin (pspp_linreg_cache * c)
407 reg_stats_tol (pspp_linreg_cache * c)
412 reg_stats_selection (pspp_linreg_cache * c)
418 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
419 int keyword, pspp_linreg_cache * c)
428 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
431 The order here must match the order in which the STATISTICS
432 keywords appear in the specification section above.
459 Set everything but F.
461 for (i = 0; i < f; i++)
468 for (i = 0; i < all; i++)
476 Default output: ANOVA table, parameter estimates,
477 and statistics for variables not entered into model,
480 if (keywords[defaults] | d)
488 statistics_keyword_output (reg_stats_r, keywords[r], c);
489 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
490 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
491 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
492 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
493 statistics_keyword_output (reg_stats_label, keywords[label], c);
494 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
495 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
496 statistics_keyword_output (reg_stats_f, keywords[f], c);
497 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
498 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
499 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
500 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
501 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
502 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
505 regression_trns_proc (void *m, struct ccase *c, int case_idx UNUSED)
509 pspp_linreg_cache *model = m;
511 const union value **vals = NULL;
512 const union value *obs = NULL;
513 struct variable **vars = NULL;
515 assert (model != NULL);
516 assert (model->depvar != NULL);
517 assert (model->resid != NULL);
519 vals = xnmalloc (n_variables, sizeof (*vals));
520 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_ORDINARY);
521 output = case_data_rw (c, model->resid->fv);
522 assert (output != NULL);
524 for (i = 0; i < n_vars; i++)
526 /* Do not use the residual variable as a predictor. */
527 if (vars[i]->index != model->resid->index)
529 /* Do not use the dependent variable as a predictor. */
530 if (vars[i]->index == model->depvar->index)
532 obs = case_data (c, i);
533 assert (obs != NULL);
537 vals[i] = case_data (c, i);
541 output->f = (*model->residual) ((const struct variable **) vars,
542 vals, obs, model, i);
544 return TRNS_CONTINUE;
547 subcommand_save (int save, pspp_linreg_cache *lc)
549 struct variable *residuals = NULL;
552 assert (lc->depvar != NULL);
556 residuals = dict_create_var (default_dict, "residuals", 0);
557 assert (residuals != NULL);
558 lc->resid = residuals;
559 add_transformation (regression_trns_proc, pspp_linreg_cache_free, lc);
563 pspp_linreg_cache_free (lc);
567 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
571 for (i = 0; i < n_vars; i++)
573 if (v->index == varlist[i]->index)
581 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
586 struct variable **varlist;
587 struct pspp_linreg_coeff *coeff;
588 const struct variable *v;
591 fprintf (fp, "%s", reg_export_categorical_encode_1);
593 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
594 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
596 coeff = c->coeff + i;
597 v = pspp_linreg_coeff_get_var (coeff, 0);
598 if (v->type == ALPHA)
600 if (!reg_inserted (v, varlist, n_vars))
602 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
604 varlist[n_vars] = (struct variable *) v;
609 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
610 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
612 for (i = 0; i < n_vars - 1; i++)
614 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
616 fprintf (fp, "&%s};\n\t", varlist[i]->name);
618 for (i = 0; i < n_vars; i++)
620 coeff = c->coeff + i;
621 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
623 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
624 varlist[i]->obs_vals->n_categories);
626 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
628 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
629 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
630 value_to_string (val, varlist[i]));
633 fprintf (fp, "%s", reg_export_categorical_encode_2);
637 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
640 struct pspp_linreg_coeff *coeff;
641 const struct variable *v;
643 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
644 for (i = 1; i < c->n_indeps; i++)
646 coeff = c->coeff + i;
647 v = pspp_linreg_coeff_get_var (coeff, 0);
648 fprintf (fp, "\"%s\",\n\t\t", v->name);
650 coeff = c->coeff + i;
651 v = pspp_linreg_coeff_get_var (coeff, 0);
652 fprintf (fp, "\"%s\"};\n\t", v->name);
655 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
657 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
658 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
659 reg_print_depvars (fp, c);
660 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
662 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
663 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
666 reg_has_categorical (pspp_linreg_cache * c)
669 const struct variable *v;
671 for (i = 1; i < c->n_coeffs; i++)
673 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
674 if (v->type == ALPHA)
683 subcommand_export (int export, pspp_linreg_cache * c)
688 int n_quantiles = 100;
691 struct pspp_linreg_coeff coeff;
696 assert (model_file != NULL);
697 fp = fopen (fh_get_file_name (model_file), "w");
699 fprintf (fp, "%s", reg_preamble);
700 reg_print_getvar (fp, c);
701 if (reg_has_categorical (c))
703 reg_print_categorical_encoding (fp, c);
705 fprintf (fp, "%s", reg_export_t_quantiles_1);
706 increment = 0.5 / (double) increment;
707 for (i = 0; i < n_quantiles - 1; i++)
709 tmp = 0.5 + 0.005 * (double) i;
710 fprintf (fp, "%.15e,\n\t\t",
711 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
713 fprintf (fp, "%.15e};\n\t",
714 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
715 fprintf (fp, "%s", reg_export_t_quantiles_2);
716 fprintf (fp, "%s", reg_mean_cmt);
717 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
718 fprintf (fp, "const char *var_names[])\n{\n\t");
719 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
720 for (i = 1; i < c->n_indeps; i++)
723 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
726 fprintf (fp, "%.15e};\n\t", coeff.estimate);
728 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
729 fprintf (fp, "int i;\n\tint j;\n\n\t");
730 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
731 fprintf (fp, "%s", reg_getvar);
732 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
734 for (i = 0; i < c->cov->size1 - 1; i++)
737 for (j = 0; j < c->cov->size2 - 1; j++)
739 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
741 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
744 for (j = 0; j < c->cov->size2 - 1; j++)
746 fprintf (fp, "%.15e, ",
747 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
749 fprintf (fp, "%.15e}\n\t",
750 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
751 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
753 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
754 fprintf (fp, "%s", reg_variance);
755 fprintf (fp, "%s", reg_export_confidence_interval);
756 tmp = c->mse * c->mse;
757 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
758 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
759 fprintf (fp, "%s", reg_export_prediction_interval_3);
761 fp = fopen ("pspp_model_reg.h", "w");
762 fprintf (fp, "%s", reg_header);
767 regression_custom_export (struct cmd_regression *cmd UNUSED)
769 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
770 if (!lex_force_match ('('))
777 model_file = fh_parse (FH_REF_FILE);
778 if (model_file == NULL)
782 if (!lex_force_match (')'))
789 cmd_regression (void)
791 if (!parse_regression (&cmd))
793 if (!multipass_procedure_with_splits (run_regression, &cmd))
794 return CMD_CASCADING_FAILURE;
802 Is variable k the dependent variable?
805 is_depvar (size_t k, const struct variable *v)
808 compare_var_names returns 0 if the variable
811 if (!compare_var_names (v, v_variables[k], NULL))
818 Mark missing cases. Return the number of non-missing cases.
821 mark_missing_cases (const struct casefile *cf, struct variable *v,
822 int *is_missing_case, double n_data)
824 struct casereader *r;
827 const union value *val;
829 for (r = casefile_get_reader (cf);
830 casereader_read (r, &c); case_destroy (&c))
832 row = casereader_cnum (r) - 1;
834 val = case_data (&c, v->fv);
835 cat_value_update (v, val);
836 if (mv_is_value_missing (&v->miss, val))
838 if (!is_missing_case[row])
840 /* Now it is missing. */
842 is_missing_case[row] = 1;
846 casereader_destroy (r);
851 /* Parser for the variables sub command */
853 regression_custom_variables(struct cmd_regression *cmd UNUSED)
858 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
863 if (!parse_variables (default_dict, &v_variables, &n_variables,
874 Count the explanatory variables. The user may or may
875 not have specified a response variable in the syntax.
878 int get_n_indep (const struct variable *v)
883 result = n_variables;
884 while (i < n_variables)
886 if (is_depvar (i, v))
896 Read from the active file. Identify the explanatory variables in
897 v_variables. Encode categorical variables. Drop cases with missing
901 int prepare_data (int n_data, int is_missing_case[],
902 struct variable **indep_vars,
903 struct variable *depvar,
904 const struct casefile *cf)
909 assert (indep_vars != NULL);
911 for (i = 0; i < n_variables; i++)
913 if (!is_depvar (i, depvar))
915 indep_vars[j] = v_variables[i];
917 if (v_variables[i]->type == ALPHA)
919 /* Make a place to hold the binary vectors
920 corresponding to this variable's values. */
921 cat_stored_values_create (v_variables[i]);
923 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
927 Mark missing cases for the dependent variable.
929 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
934 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
937 size_t n_data = 0; /* Number of valide cases. */
938 size_t n_cases; /* Number of cases. */
944 Keep track of the missing cases.
946 int *is_missing_case;
947 const union value *val;
948 struct casereader *r;
950 struct variable **indep_vars;
951 struct design_matrix *X;
953 pspp_linreg_cache *lcache;
954 pspp_linreg_opts lopts;
958 dict_get_vars (default_dict, &v_variables, &n_variables,
962 n_cases = casefile_get_case_cnt (cf);
964 for (i = 0; i < cmd.n_dependent; i++)
966 if (cmd.v_dependent[i]->type != NUMERIC)
968 msg (SE, gettext ("Dependent variable must be numeric."));
969 pspp_reg_rc = CMD_FAILURE;
974 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
976 lopts.get_depvar_mean_std = 1;
979 for (k = 0; k < cmd.n_dependent; k++)
981 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
982 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
983 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
984 assert (indep_vars != NULL);
986 for (i = 0; i < n_cases; i++)
988 is_missing_case[i] = 0;
990 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
992 (const struct casefile *) cf);
993 Y = gsl_vector_alloc (n_data);
996 design_matrix_create (n_indep, (const struct variable **) indep_vars,
998 for (i = 0; i < X->m->size2; i++)
1000 lopts.get_indep_mean_std[i] = 1;
1002 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1003 lcache->indep_means = gsl_vector_alloc (X->m->size2);
1004 lcache->indep_std = gsl_vector_alloc (X->m->size2);
1005 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
1007 For large data sets, use QR decomposition.
1009 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1011 lcache->method = PSPP_LINREG_SVD;
1015 The second pass creates the design matrix.
1018 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1020 /* Iterate over the cases. */
1022 case_num = casereader_cnum (r) - 1;
1023 if (!is_missing_case[case_num])
1025 for (i = 0; i < n_variables; ++i) /* Iterate over the
1030 val = case_data (&c, v_variables[i]->fv);
1032 Independent/dependent variable separation. The
1033 'variables' subcommand specifies a varlist which contains
1034 both dependent and independent variables. The dependent
1035 variables are specified with the 'dependent'
1036 subcommand, and maybe also in the 'variables' subcommand.
1037 We need to separate the two.
1039 if (!is_depvar (i, cmd.v_dependent[k]))
1041 if (v_variables[i]->type == ALPHA)
1043 design_matrix_set_categorical (X, row, v_variables[i], val);
1045 else if (v_variables[i]->type == NUMERIC)
1047 design_matrix_set_numeric (X, row, v_variables[i], val);
1051 val = case_data (&c, cmd.v_dependent[k]->fv);
1052 gsl_vector_set (Y, row, val->f);
1057 Now that we know the number of coefficients, allocate space
1058 and store pointers to the variables that correspond to the
1061 pspp_linreg_coeff_init (lcache, X);
1064 Find the least-squares estimates and other statistics.
1066 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1067 subcommand_statistics (cmd.a_statistics, lcache);
1068 subcommand_export (cmd.sbc_export, lcache);
1069 subcommand_save (cmd.sbc_save, lcache);
1070 gsl_vector_free (Y);
1071 design_matrix_destroy (X);
1073 free (lopts.get_indep_mean_std);
1074 casereader_destroy (r);
1077 free (is_missing_case);