1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
82 Variables used (both explanatory and response).
84 static struct variable **v_variables;
89 static size_t n_variables;
92 File where the model will be saved if the EXPORT subcommand
95 struct file_handle *model_file;
98 Return value for the procedure.
100 int pspp_reg_rc = CMD_SUCCESS;
102 static bool run_regression (const struct casefile *, void *);
105 STATISTICS subcommand output functions.
107 static void reg_stats_r (pspp_linreg_cache *);
108 static void reg_stats_coeff (pspp_linreg_cache *);
109 static void reg_stats_anova (pspp_linreg_cache *);
110 static void reg_stats_outs (pspp_linreg_cache *);
111 static void reg_stats_zpp (pspp_linreg_cache *);
112 static void reg_stats_label (pspp_linreg_cache *);
113 static void reg_stats_sha (pspp_linreg_cache *);
114 static void reg_stats_ci (pspp_linreg_cache *);
115 static void reg_stats_f (pspp_linreg_cache *);
116 static void reg_stats_bcov (pspp_linreg_cache *);
117 static void reg_stats_ses (pspp_linreg_cache *);
118 static void reg_stats_xtx (pspp_linreg_cache *);
119 static void reg_stats_collin (pspp_linreg_cache *);
120 static void reg_stats_tol (pspp_linreg_cache *);
121 static void reg_stats_selection (pspp_linreg_cache *);
122 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
123 int, pspp_linreg_cache *);
126 reg_stats_r (pspp_linreg_cache * c)
136 rsq = c->ssm / c->sst;
137 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
138 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
139 t = tab_create (n_cols, n_rows, 0);
140 tab_dim (t, tab_natural_dimensions);
141 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
142 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
143 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
144 tab_vline (t, TAL_0, 1, 0, 0);
146 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
147 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
148 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
149 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
150 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
151 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
152 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
153 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
154 tab_title (t, _("Model Summary"));
159 Table showing estimated regression coefficients.
162 reg_stats_coeff (pspp_linreg_cache * c)
174 const struct variable *v;
175 const union value *val;
180 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
181 n_rows = c->n_coeffs + 2;
183 t = tab_create (n_cols, n_rows, 0);
184 tab_headers (t, 2, 0, 1, 0);
185 tab_dim (t, tab_natural_dimensions);
186 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
187 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
188 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
189 tab_vline (t, TAL_0, 1, 0, 0);
191 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
192 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
193 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
194 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
195 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
196 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
197 coeff = c->coeff[0].estimate;
198 tab_float (t, 2, 1, 0, coeff, 10, 2);
199 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
200 tab_float (t, 3, 1, 0, std_err, 10, 2);
201 beta = coeff / c->depvar_std;
202 tab_float (t, 4, 1, 0, beta, 10, 2);
203 t_stat = coeff / std_err;
204 tab_float (t, 5, 1, 0, t_stat, 10, 2);
205 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
206 tab_float (t, 6, 1, 0, pval, 10, 2);
207 for (j = 1; j <= c->n_indeps; j++)
209 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
210 label = var_to_string (v);
211 /* Do not overwrite the variable's name. */
212 strncpy (tmp, label, MAX_STRING);
213 if (v->type == ALPHA)
216 Append the value associated with this coefficient.
217 This makes sense only if we us the usual binary encoding
221 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
222 val_s = value_to_string (val, v);
223 strncat (tmp, val_s, MAX_STRING);
226 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
228 Regression coefficients.
230 coeff = c->coeff[j].estimate;
231 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
233 Standard error of the coefficients.
235 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
236 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
238 'Standardized' coefficient, i.e., regression coefficient
239 if all variables had unit variance.
241 beta = gsl_vector_get (c->indep_std, j);
242 beta *= coeff / c->depvar_std;
243 tab_float (t, 4, j + 1, 0, beta, 10, 2);
246 Test statistic for H0: coefficient is 0.
248 t_stat = coeff / std_err;
249 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
251 P values for the test statistic above.
253 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
254 tab_float (t, 6, j + 1, 0, pval, 10, 2);
256 tab_title (t, _("Coefficients"));
262 Display the ANOVA table.
265 reg_stats_anova (pspp_linreg_cache * c)
269 const double msm = c->ssm / c->dfm;
270 const double mse = c->sse / c->dfe;
271 const double F = msm / mse;
272 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
277 t = tab_create (n_cols, n_rows, 0);
278 tab_headers (t, 2, 0, 1, 0);
279 tab_dim (t, tab_natural_dimensions);
281 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
283 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
284 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
285 tab_vline (t, TAL_0, 1, 0, 0);
287 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
288 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
289 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
290 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
291 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
293 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
294 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
295 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
297 /* Sums of Squares */
298 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
299 tab_float (t, 2, 3, 0, c->sst, 10, 2);
300 tab_float (t, 2, 2, 0, c->sse, 10, 2);
303 /* Degrees of freedom */
304 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
305 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
306 tab_float (t, 3, 3, 0, c->dft, 4, 0);
310 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
311 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
313 tab_float (t, 5, 1, 0, F, 8, 3);
315 tab_float (t, 6, 1, 0, pval, 8, 3);
317 tab_title (t, _("ANOVA"));
321 reg_stats_outs (pspp_linreg_cache * c)
326 reg_stats_zpp (pspp_linreg_cache * c)
331 reg_stats_label (pspp_linreg_cache * c)
336 reg_stats_sha (pspp_linreg_cache * c)
341 reg_stats_ci (pspp_linreg_cache * c)
346 reg_stats_f (pspp_linreg_cache * c)
351 reg_stats_bcov (pspp_linreg_cache * c)
363 n_cols = c->n_indeps + 1 + 2;
364 n_rows = 2 * (c->n_indeps + 1);
365 t = tab_create (n_cols, n_rows, 0);
366 tab_headers (t, 2, 0, 1, 0);
367 tab_dim (t, tab_natural_dimensions);
368 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
369 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
370 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
371 tab_vline (t, TAL_0, 1, 0, 0);
372 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
373 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
374 for (i = 1; i < c->n_coeffs; i++)
376 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
377 label = var_to_string (v);
378 tab_text (t, 2, i, TAB_CENTER, label);
379 tab_text (t, i + 2, 0, TAB_CENTER, label);
380 for (k = 1; k < c->n_coeffs; k++)
382 col = (i <= k) ? k : i;
383 row = (i <= k) ? i : k;
384 tab_float (t, k + 2, i, TAB_CENTER,
385 gsl_matrix_get (c->cov, row, col), 8, 3);
388 tab_title (t, _("Coefficient Correlations"));
392 reg_stats_ses (pspp_linreg_cache * c)
397 reg_stats_xtx (pspp_linreg_cache * c)
402 reg_stats_collin (pspp_linreg_cache * c)
407 reg_stats_tol (pspp_linreg_cache * c)
412 reg_stats_selection (pspp_linreg_cache * c)
418 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
419 int keyword, pspp_linreg_cache * c)
428 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
431 The order here must match the order in which the STATISTICS
432 keywords appear in the specification section above.
459 Set everything but F.
461 for (i = 0; i < f; i++)
468 for (i = 0; i < all; i++)
476 Default output: ANOVA table, parameter estimates,
477 and statistics for variables not entered into model,
480 if (keywords[defaults] | d)
488 statistics_keyword_output (reg_stats_r, keywords[r], c);
489 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
490 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
491 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
492 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
493 statistics_keyword_output (reg_stats_label, keywords[label], c);
494 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
495 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
496 statistics_keyword_output (reg_stats_f, keywords[f], c);
497 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
498 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
499 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
500 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
501 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
502 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
505 subcommand_save (int save, pspp_linreg_cache *lc, const struct casefile *cf, int *is_missing)
511 const union value **vals;
512 struct casereader *r;
518 vals = xnmalloc (n_variables, sizeof (*vals));
519 for (r = casefile_get_reader (cf); casereader_read (r, &c);
522 case_num = casereader_cnum (r) - 1;
523 if (!is_missing[case_num])
525 for (i = 0; i < n_variables; ++i)
527 vals[i] = case_data (&c, v_variables[i]->fv);
529 residual = (*lc->predict) (v_variables, vals, lc, n_variables);
536 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
540 for (i = 0; i < n_vars; i++)
542 if (v->index == varlist[i]->index)
550 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
555 struct variable **varlist;
556 struct pspp_linreg_coeff *coeff;
557 const struct variable *v;
560 fprintf (fp, "%s", reg_export_categorical_encode_1);
562 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
563 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
565 coeff = c->coeff + i;
566 v = pspp_linreg_coeff_get_var (coeff, 0);
567 if (v->type == ALPHA)
569 if (!reg_inserted (v, varlist, n_vars))
571 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
573 varlist[n_vars] = (struct variable *) v;
578 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
579 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
581 for (i = 0; i < n_vars - 1; i++)
583 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
585 fprintf (fp, "&%s};\n\t", varlist[i]->name);
587 for (i = 0; i < n_vars; i++)
589 coeff = c->coeff + i;
590 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
592 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
593 varlist[i]->obs_vals->n_categories);
595 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
597 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
598 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
599 value_to_string (val, varlist[i]));
602 fprintf (fp, "%s", reg_export_categorical_encode_2);
606 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
609 struct pspp_linreg_coeff *coeff;
610 const struct variable *v;
612 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
613 for (i = 1; i < c->n_indeps; i++)
615 coeff = c->coeff + i;
616 v = pspp_linreg_coeff_get_var (coeff, 0);
617 fprintf (fp, "\"%s\",\n\t\t", v->name);
619 coeff = c->coeff + i;
620 v = pspp_linreg_coeff_get_var (coeff, 0);
621 fprintf (fp, "\"%s\"};\n\t", v->name);
624 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
626 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
627 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
628 reg_print_depvars (fp, c);
629 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
631 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
632 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
635 reg_has_categorical (pspp_linreg_cache * c)
638 const struct variable *v;
640 for (i = 1; i < c->n_coeffs; i++)
642 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
643 if (v->type == ALPHA)
652 subcommand_export (int export, pspp_linreg_cache * c)
657 int n_quantiles = 100;
660 struct pspp_linreg_coeff coeff;
665 assert (model_file != NULL);
666 fp = fopen (fh_get_filename (model_file), "w");
668 fprintf (fp, "%s", reg_preamble);
669 reg_print_getvar (fp, c);
670 if (reg_has_categorical (c))
672 reg_print_categorical_encoding (fp, c);
674 fprintf (fp, "%s", reg_export_t_quantiles_1);
675 increment = 0.5 / (double) increment;
676 for (i = 0; i < n_quantiles - 1; i++)
678 tmp = 0.5 + 0.005 * (double) i;
679 fprintf (fp, "%.15e,\n\t\t",
680 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
682 fprintf (fp, "%.15e};\n\t",
683 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
684 fprintf (fp, "%s", reg_export_t_quantiles_2);
685 fprintf (fp, "%s", reg_mean_cmt);
686 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
687 fprintf (fp, "const char *var_names[])\n{\n\t");
688 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
689 for (i = 1; i < c->n_indeps; i++)
692 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
695 fprintf (fp, "%.15e};\n\t", coeff.estimate);
697 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
698 fprintf (fp, "int i;\n\tint j;\n\n\t");
699 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
700 fprintf (fp, "%s", reg_getvar);
701 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
703 for (i = 0; i < c->cov->size1 - 1; i++)
706 for (j = 0; j < c->cov->size2 - 1; j++)
708 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
710 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
713 for (j = 0; j < c->cov->size2 - 1; j++)
715 fprintf (fp, "%.15e, ",
716 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
718 fprintf (fp, "%.15e}\n\t",
719 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
720 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
722 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
723 fprintf (fp, "%s", reg_variance);
724 fprintf (fp, "%s", reg_export_confidence_interval);
725 tmp = c->mse * c->mse;
726 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
727 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
728 fprintf (fp, "%s", reg_export_prediction_interval_3);
730 fp = fopen ("pspp_model_reg.h", "w");
731 fprintf (fp, "%s", reg_header);
736 regression_custom_export (struct cmd_regression *cmd UNUSED)
738 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
739 if (!lex_force_match ('('))
746 model_file = fh_parse (FH_REF_FILE);
747 if (model_file == NULL)
751 if (!lex_force_match (')'))
758 cmd_regression (void)
760 if (!parse_regression (&cmd))
762 if (!multipass_procedure_with_splits (run_regression, &cmd))
763 return CMD_CASCADING_FAILURE;
771 Is variable k the dependent variable?
774 is_depvar (size_t k, const struct variable *v)
777 compare_var_names returns 0 if the variable
780 if (!compare_var_names (v, v_variables[k], NULL))
787 Mark missing cases. Return the number of non-missing cases.
790 mark_missing_cases (const struct casefile *cf, struct variable *v,
791 int *is_missing_case, double n_data)
793 struct casereader *r;
796 const union value *val;
798 for (r = casefile_get_reader (cf);
799 casereader_read (r, &c); case_destroy (&c))
801 row = casereader_cnum (r) - 1;
803 val = case_data (&c, v->fv);
804 cat_value_update (v, val);
805 if (mv_is_value_missing (&v->miss, val))
807 if (!is_missing_case[row])
809 /* Now it is missing. */
811 is_missing_case[row] = 1;
815 casereader_destroy (r);
820 /* Parser for the variables sub command */
822 regression_custom_variables(struct cmd_regression *cmd UNUSED)
827 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
832 if (!parse_variables (default_dict, &v_variables, &n_variables,
843 Count the explanatory variables. The user may or may
844 not have specified a response variable in the syntax.
847 int get_n_indep (const struct variable *v)
852 result = n_variables;
853 while (i < n_variables)
855 if (is_depvar (i, v))
865 Read from the active file. Identify the explanatory variables in
866 v_variables. Encode categorical variables. Drop cases with missing
870 int prepare_data (int n_data, int is_missing_case[],
871 struct variable **indep_vars,
872 struct variable *depvar,
873 const struct casefile *cf)
878 assert (indep_vars != NULL);
880 for (i = 0; i < n_variables; i++)
882 if (!is_depvar (i, depvar))
884 indep_vars[j] = v_variables[i];
886 if (v_variables[i]->type == ALPHA)
888 /* Make a place to hold the binary vectors
889 corresponding to this variable's values. */
890 cat_stored_values_create (v_variables[i]);
892 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
896 Mark missing cases for the dependent variable.
898 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
903 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
906 size_t n_data = 0; /* Number of valide cases. */
907 size_t n_cases; /* Number of cases. */
913 Keep track of the missing cases.
915 int *is_missing_case;
916 const union value *val;
917 struct casereader *r;
919 struct variable **indep_vars;
920 struct design_matrix *X;
922 pspp_linreg_cache *lcache;
923 pspp_linreg_opts lopts;
927 dict_get_vars (default_dict, &v_variables, &n_variables,
931 n_cases = casefile_get_case_cnt (cf);
933 for (i = 0; i < cmd.n_dependent; i++)
935 if (cmd.v_dependent[i]->type != NUMERIC)
937 msg (SE, gettext ("Dependent variable must be numeric."));
938 pspp_reg_rc = CMD_FAILURE;
943 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
945 lopts.get_depvar_mean_std = 1;
948 for (k = 0; k < cmd.n_dependent; k++)
950 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
951 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
952 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
953 assert (indep_vars != NULL);
955 for (i = 0; i < n_cases; i++)
957 is_missing_case[i] = 0;
959 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
961 (const struct casefile *) cf);
962 Y = gsl_vector_alloc (n_data);
965 design_matrix_create (n_indep, (const struct variable **) indep_vars,
967 for (i = 0; i < X->m->size2; i++)
969 lopts.get_indep_mean_std[i] = 1;
971 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
972 lcache->indep_means = gsl_vector_alloc (X->m->size2);
973 lcache->indep_std = gsl_vector_alloc (X->m->size2);
974 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
976 For large data sets, use QR decomposition.
978 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
980 lcache->method = PSPP_LINREG_SVD;
984 The second pass creates the design matrix.
987 for (r = casefile_get_reader (cf); casereader_read (r, &c);
989 /* Iterate over the cases. */
991 case_num = casereader_cnum (r) - 1;
992 if (!is_missing_case[case_num])
994 for (i = 0; i < n_variables; ++i) /* Iterate over the
999 val = case_data (&c, v_variables[i]->fv);
1001 Independent/dependent variable separation. The
1002 'variables' subcommand specifies a varlist which contains
1003 both dependent and independent variables. The dependent
1004 variables are specified with the 'dependent'
1005 subcommand, and maybe also in the 'variables' subcommand.
1006 We need to separate the two.
1008 if (!is_depvar (i, cmd.v_dependent[k]))
1010 if (v_variables[i]->type == ALPHA)
1012 design_matrix_set_categorical (X, row, v_variables[i], val);
1014 else if (v_variables[i]->type == NUMERIC)
1016 design_matrix_set_numeric (X, row, v_variables[i], val);
1020 val = case_data (&c, cmd.v_dependent[k]->fv);
1021 gsl_vector_set (Y, row, val->f);
1026 Now that we know the number of coefficients, allocate space
1027 and store pointers to the variables that correspond to the
1030 pspp_linreg_coeff_init (lcache, X);
1033 Find the least-squares estimates and other statistics.
1035 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1036 subcommand_statistics (cmd.a_statistics, lcache);
1037 subcommand_save (cmd.sbc_save, lcache, cf, is_missing_case);
1038 subcommand_export (cmd.sbc_export, lcache);
1039 gsl_vector_free (Y);
1040 design_matrix_destroy (X);
1042 pspp_linreg_cache_free (lcache);
1043 free (lopts.get_indep_mean_std);
1044 casereader_destroy (r);
1047 free (is_missing_case);