1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
82 Variables used (both explanatory and response).
84 static struct variable **v_variables;
89 static size_t n_variables;
92 File where the model will be saved if the EXPORT subcommand
95 struct file_handle *model_file;
98 Return value for the procedure.
100 int pspp_reg_rc = CMD_SUCCESS;
102 static bool run_regression (const struct casefile *, void *);
105 STATISTICS subcommand output functions.
107 static void reg_stats_r (pspp_linreg_cache *);
108 static void reg_stats_coeff (pspp_linreg_cache *);
109 static void reg_stats_anova (pspp_linreg_cache *);
110 static void reg_stats_outs (pspp_linreg_cache *);
111 static void reg_stats_zpp (pspp_linreg_cache *);
112 static void reg_stats_label (pspp_linreg_cache *);
113 static void reg_stats_sha (pspp_linreg_cache *);
114 static void reg_stats_ci (pspp_linreg_cache *);
115 static void reg_stats_f (pspp_linreg_cache *);
116 static void reg_stats_bcov (pspp_linreg_cache *);
117 static void reg_stats_ses (pspp_linreg_cache *);
118 static void reg_stats_xtx (pspp_linreg_cache *);
119 static void reg_stats_collin (pspp_linreg_cache *);
120 static void reg_stats_tol (pspp_linreg_cache *);
121 static void reg_stats_selection (pspp_linreg_cache *);
122 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
123 int, pspp_linreg_cache *);
126 reg_stats_r (pspp_linreg_cache * c)
136 rsq = c->ssm / c->sst;
137 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
138 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
139 t = tab_create (n_cols, n_rows, 0);
140 tab_dim (t, tab_natural_dimensions);
141 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
142 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
143 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
144 tab_vline (t, TAL_0, 1, 0, 0);
146 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
147 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
148 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
149 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
150 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
151 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
152 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
153 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
154 tab_title (t, _("Model Summary"));
159 Table showing estimated regression coefficients.
162 reg_stats_coeff (pspp_linreg_cache * c)
174 const struct variable *v;
175 const union value *val;
180 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
181 n_rows = c->n_coeffs + 2;
183 t = tab_create (n_cols, n_rows, 0);
184 tab_headers (t, 2, 0, 1, 0);
185 tab_dim (t, tab_natural_dimensions);
186 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
187 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
188 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
189 tab_vline (t, TAL_0, 1, 0, 0);
191 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
192 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
193 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
194 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
195 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
196 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
197 coeff = c->coeff[0].estimate;
198 tab_float (t, 2, 1, 0, coeff, 10, 2);
199 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
200 tab_float (t, 3, 1, 0, std_err, 10, 2);
201 beta = coeff / c->depvar_std;
202 tab_float (t, 4, 1, 0, beta, 10, 2);
203 t_stat = coeff / std_err;
204 tab_float (t, 5, 1, 0, t_stat, 10, 2);
205 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
206 tab_float (t, 6, 1, 0, pval, 10, 2);
207 for (j = 1; j <= c->n_indeps; j++)
209 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
210 label = var_to_string (v);
211 /* Do not overwrite the variable's name. */
212 strncpy (tmp, label, MAX_STRING);
213 if (v->type == ALPHA)
216 Append the value associated with this coefficient.
217 This makes sense only if we us the usual binary encoding
221 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
222 val_s = value_to_string (val, v);
223 strncat (tmp, val_s, MAX_STRING);
226 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
228 Regression coefficients.
230 coeff = c->coeff[j].estimate;
231 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
233 Standard error of the coefficients.
235 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
236 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
238 'Standardized' coefficient, i.e., regression coefficient
239 if all variables had unit variance.
241 beta = gsl_vector_get (c->indep_std, j);
242 beta *= coeff / c->depvar_std;
243 tab_float (t, 4, j + 1, 0, beta, 10, 2);
246 Test statistic for H0: coefficient is 0.
248 t_stat = coeff / std_err;
249 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
251 P values for the test statistic above.
253 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
254 tab_float (t, 6, j + 1, 0, pval, 10, 2);
256 tab_title (t, _("Coefficients"));
262 Display the ANOVA table.
265 reg_stats_anova (pspp_linreg_cache * c)
269 const double msm = c->ssm / c->dfm;
270 const double mse = c->sse / c->dfe;
271 const double F = msm / mse;
272 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
277 t = tab_create (n_cols, n_rows, 0);
278 tab_headers (t, 2, 0, 1, 0);
279 tab_dim (t, tab_natural_dimensions);
281 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
283 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
284 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
285 tab_vline (t, TAL_0, 1, 0, 0);
287 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
288 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
289 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
290 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
291 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
293 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
294 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
295 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
297 /* Sums of Squares */
298 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
299 tab_float (t, 2, 3, 0, c->sst, 10, 2);
300 tab_float (t, 2, 2, 0, c->sse, 10, 2);
303 /* Degrees of freedom */
304 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
305 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
306 tab_float (t, 3, 3, 0, c->dft, 4, 0);
310 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
311 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
313 tab_float (t, 5, 1, 0, F, 8, 3);
315 tab_float (t, 6, 1, 0, pval, 8, 3);
317 tab_title (t, _("ANOVA"));
321 reg_stats_outs (pspp_linreg_cache * c)
326 reg_stats_zpp (pspp_linreg_cache * c)
331 reg_stats_label (pspp_linreg_cache * c)
336 reg_stats_sha (pspp_linreg_cache * c)
341 reg_stats_ci (pspp_linreg_cache * c)
346 reg_stats_f (pspp_linreg_cache * c)
351 reg_stats_bcov (pspp_linreg_cache * c)
363 n_cols = c->n_indeps + 1 + 2;
364 n_rows = 2 * (c->n_indeps + 1);
365 t = tab_create (n_cols, n_rows, 0);
366 tab_headers (t, 2, 0, 1, 0);
367 tab_dim (t, tab_natural_dimensions);
368 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
369 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
370 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
371 tab_vline (t, TAL_0, 1, 0, 0);
372 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
373 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
374 for (i = 1; i < c->n_coeffs; i++)
376 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
377 label = var_to_string (v);
378 tab_text (t, 2, i, TAB_CENTER, label);
379 tab_text (t, i + 2, 0, TAB_CENTER, label);
380 for (k = 1; k < c->n_coeffs; k++)
382 col = (i <= k) ? k : i;
383 row = (i <= k) ? i : k;
384 tab_float (t, k + 2, i, TAB_CENTER,
385 gsl_matrix_get (c->cov, row, col), 8, 3);
388 tab_title (t, _("Coefficient Correlations"));
392 reg_stats_ses (pspp_linreg_cache * c)
397 reg_stats_xtx (pspp_linreg_cache * c)
402 reg_stats_collin (pspp_linreg_cache * c)
407 reg_stats_tol (pspp_linreg_cache * c)
412 reg_stats_selection (pspp_linreg_cache * c)
418 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
419 int keyword, pspp_linreg_cache * c)
428 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
431 The order here must match the order in which the STATISTICS
432 keywords appear in the specification section above.
459 Set everything but F.
461 for (i = 0; i < f; i++)
468 for (i = 0; i < all; i++)
476 Default output: ANOVA table, parameter estimates,
477 and statistics for variables not entered into model,
480 if (keywords[defaults] | d)
488 statistics_keyword_output (reg_stats_r, keywords[r], c);
489 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
490 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
491 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
492 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
493 statistics_keyword_output (reg_stats_label, keywords[label], c);
494 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
495 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
496 statistics_keyword_output (reg_stats_f, keywords[f], c);
497 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
498 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
499 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
500 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
501 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
502 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
505 subcommand_save (int save, pspp_linreg_cache *lc, const struct casefile *cf, int *is_missing)
510 const union value **vals = NULL;
511 const union value *obs = NULL;
512 struct casereader *r;
516 assert (lc->depvar != NULL);
517 assert (is_missing != NULL);
521 vals = xnmalloc (n_variables, sizeof (*vals));
522 for (r = casefile_get_reader (cf); casereader_read (r, &c);
525 case_num = casereader_cnum (r) - 1;
526 if (!is_missing[case_num])
528 for (i = 0; i < n_variables; ++i)
530 vals[i] = case_data (&c, v_variables[i]->fv);
531 if (v_variables[i]->index == lc->depvar->index)
536 residual = (*lc->residual) ((const struct variable **) v_variables,
537 (const union value **) vals, obs, lc, n_variables);
544 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
548 for (i = 0; i < n_vars; i++)
550 if (v->index == varlist[i]->index)
558 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
563 struct variable **varlist;
564 struct pspp_linreg_coeff *coeff;
565 const struct variable *v;
568 fprintf (fp, "%s", reg_export_categorical_encode_1);
570 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
571 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
573 coeff = c->coeff + i;
574 v = pspp_linreg_coeff_get_var (coeff, 0);
575 if (v->type == ALPHA)
577 if (!reg_inserted (v, varlist, n_vars))
579 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
581 varlist[n_vars] = (struct variable *) v;
586 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
587 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
589 for (i = 0; i < n_vars - 1; i++)
591 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
593 fprintf (fp, "&%s};\n\t", varlist[i]->name);
595 for (i = 0; i < n_vars; i++)
597 coeff = c->coeff + i;
598 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
600 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
601 varlist[i]->obs_vals->n_categories);
603 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
605 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
606 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
607 value_to_string (val, varlist[i]));
610 fprintf (fp, "%s", reg_export_categorical_encode_2);
614 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
617 struct pspp_linreg_coeff *coeff;
618 const struct variable *v;
620 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
621 for (i = 1; i < c->n_indeps; i++)
623 coeff = c->coeff + i;
624 v = pspp_linreg_coeff_get_var (coeff, 0);
625 fprintf (fp, "\"%s\",\n\t\t", v->name);
627 coeff = c->coeff + i;
628 v = pspp_linreg_coeff_get_var (coeff, 0);
629 fprintf (fp, "\"%s\"};\n\t", v->name);
632 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
634 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
635 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
636 reg_print_depvars (fp, c);
637 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
639 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
640 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
643 reg_has_categorical (pspp_linreg_cache * c)
646 const struct variable *v;
648 for (i = 1; i < c->n_coeffs; i++)
650 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
651 if (v->type == ALPHA)
660 subcommand_export (int export, pspp_linreg_cache * c)
665 int n_quantiles = 100;
668 struct pspp_linreg_coeff coeff;
673 assert (model_file != NULL);
674 fp = fopen (fh_get_filename (model_file), "w");
676 fprintf (fp, "%s", reg_preamble);
677 reg_print_getvar (fp, c);
678 if (reg_has_categorical (c))
680 reg_print_categorical_encoding (fp, c);
682 fprintf (fp, "%s", reg_export_t_quantiles_1);
683 increment = 0.5 / (double) increment;
684 for (i = 0; i < n_quantiles - 1; i++)
686 tmp = 0.5 + 0.005 * (double) i;
687 fprintf (fp, "%.15e,\n\t\t",
688 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
690 fprintf (fp, "%.15e};\n\t",
691 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
692 fprintf (fp, "%s", reg_export_t_quantiles_2);
693 fprintf (fp, "%s", reg_mean_cmt);
694 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
695 fprintf (fp, "const char *var_names[])\n{\n\t");
696 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
697 for (i = 1; i < c->n_indeps; i++)
700 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
703 fprintf (fp, "%.15e};\n\t", coeff.estimate);
705 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
706 fprintf (fp, "int i;\n\tint j;\n\n\t");
707 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
708 fprintf (fp, "%s", reg_getvar);
709 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
711 for (i = 0; i < c->cov->size1 - 1; i++)
714 for (j = 0; j < c->cov->size2 - 1; j++)
716 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
718 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
721 for (j = 0; j < c->cov->size2 - 1; j++)
723 fprintf (fp, "%.15e, ",
724 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
726 fprintf (fp, "%.15e}\n\t",
727 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
728 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
730 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
731 fprintf (fp, "%s", reg_variance);
732 fprintf (fp, "%s", reg_export_confidence_interval);
733 tmp = c->mse * c->mse;
734 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
735 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
736 fprintf (fp, "%s", reg_export_prediction_interval_3);
738 fp = fopen ("pspp_model_reg.h", "w");
739 fprintf (fp, "%s", reg_header);
744 regression_custom_export (struct cmd_regression *cmd UNUSED)
746 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
747 if (!lex_force_match ('('))
754 model_file = fh_parse (FH_REF_FILE);
755 if (model_file == NULL)
759 if (!lex_force_match (')'))
766 cmd_regression (void)
768 if (!parse_regression (&cmd))
770 if (!multipass_procedure_with_splits (run_regression, &cmd))
771 return CMD_CASCADING_FAILURE;
779 Is variable k the dependent variable?
782 is_depvar (size_t k, const struct variable *v)
785 compare_var_names returns 0 if the variable
788 if (!compare_var_names (v, v_variables[k], NULL))
795 Mark missing cases. Return the number of non-missing cases.
798 mark_missing_cases (const struct casefile *cf, struct variable *v,
799 int *is_missing_case, double n_data)
801 struct casereader *r;
804 const union value *val;
806 for (r = casefile_get_reader (cf);
807 casereader_read (r, &c); case_destroy (&c))
809 row = casereader_cnum (r) - 1;
811 val = case_data (&c, v->fv);
812 cat_value_update (v, val);
813 if (mv_is_value_missing (&v->miss, val))
815 if (!is_missing_case[row])
817 /* Now it is missing. */
819 is_missing_case[row] = 1;
823 casereader_destroy (r);
828 /* Parser for the variables sub command */
830 regression_custom_variables(struct cmd_regression *cmd UNUSED)
835 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
840 if (!parse_variables (default_dict, &v_variables, &n_variables,
851 Count the explanatory variables. The user may or may
852 not have specified a response variable in the syntax.
855 int get_n_indep (const struct variable *v)
860 result = n_variables;
861 while (i < n_variables)
863 if (is_depvar (i, v))
873 Read from the active file. Identify the explanatory variables in
874 v_variables. Encode categorical variables. Drop cases with missing
878 int prepare_data (int n_data, int is_missing_case[],
879 struct variable **indep_vars,
880 struct variable *depvar,
881 const struct casefile *cf)
886 assert (indep_vars != NULL);
888 for (i = 0; i < n_variables; i++)
890 if (!is_depvar (i, depvar))
892 indep_vars[j] = v_variables[i];
894 if (v_variables[i]->type == ALPHA)
896 /* Make a place to hold the binary vectors
897 corresponding to this variable's values. */
898 cat_stored_values_create (v_variables[i]);
900 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
904 Mark missing cases for the dependent variable.
906 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
911 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
914 size_t n_data = 0; /* Number of valide cases. */
915 size_t n_cases; /* Number of cases. */
921 Keep track of the missing cases.
923 int *is_missing_case;
924 const union value *val;
925 struct casereader *r;
927 struct variable **indep_vars;
928 struct design_matrix *X;
930 pspp_linreg_cache *lcache;
931 pspp_linreg_opts lopts;
935 dict_get_vars (default_dict, &v_variables, &n_variables,
939 n_cases = casefile_get_case_cnt (cf);
941 for (i = 0; i < cmd.n_dependent; i++)
943 if (cmd.v_dependent[i]->type != NUMERIC)
945 msg (SE, gettext ("Dependent variable must be numeric."));
946 pspp_reg_rc = CMD_FAILURE;
951 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
953 lopts.get_depvar_mean_std = 1;
956 for (k = 0; k < cmd.n_dependent; k++)
958 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
959 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
960 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
961 assert (indep_vars != NULL);
963 for (i = 0; i < n_cases; i++)
965 is_missing_case[i] = 0;
967 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
969 (const struct casefile *) cf);
970 Y = gsl_vector_alloc (n_data);
973 design_matrix_create (n_indep, (const struct variable **) indep_vars,
975 for (i = 0; i < X->m->size2; i++)
977 lopts.get_indep_mean_std[i] = 1;
979 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
980 lcache->indep_means = gsl_vector_alloc (X->m->size2);
981 lcache->indep_std = gsl_vector_alloc (X->m->size2);
982 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
984 For large data sets, use QR decomposition.
986 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
988 lcache->method = PSPP_LINREG_SVD;
992 The second pass creates the design matrix.
995 for (r = casefile_get_reader (cf); casereader_read (r, &c);
997 /* Iterate over the cases. */
999 case_num = casereader_cnum (r) - 1;
1000 if (!is_missing_case[case_num])
1002 for (i = 0; i < n_variables; ++i) /* Iterate over the
1007 val = case_data (&c, v_variables[i]->fv);
1009 Independent/dependent variable separation. The
1010 'variables' subcommand specifies a varlist which contains
1011 both dependent and independent variables. The dependent
1012 variables are specified with the 'dependent'
1013 subcommand, and maybe also in the 'variables' subcommand.
1014 We need to separate the two.
1016 if (!is_depvar (i, cmd.v_dependent[k]))
1018 if (v_variables[i]->type == ALPHA)
1020 design_matrix_set_categorical (X, row, v_variables[i], val);
1022 else if (v_variables[i]->type == NUMERIC)
1024 design_matrix_set_numeric (X, row, v_variables[i], val);
1028 val = case_data (&c, cmd.v_dependent[k]->fv);
1029 gsl_vector_set (Y, row, val->f);
1034 Now that we know the number of coefficients, allocate space
1035 and store pointers to the variables that correspond to the
1038 pspp_linreg_coeff_init (lcache, X);
1041 Find the least-squares estimates and other statistics.
1043 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1044 subcommand_statistics (cmd.a_statistics, lcache);
1045 subcommand_save (cmd.sbc_save, lcache, cf, is_missing_case);
1046 subcommand_export (cmd.sbc_export, lcache);
1047 gsl_vector_free (Y);
1048 design_matrix_destroy (X);
1050 pspp_linreg_cache_free (lcache);
1051 free (lopts.get_indep_mean_std);
1052 casereader_destroy (r);
1055 free (is_missing_case);