1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
82 Variables used (both explanatory and response).
84 static struct variable **v_variables;
89 static size_t n_variables;
92 File where the model will be saved if the EXPORT subcommand
95 struct file_handle *model_file;
98 Return value for the procedure.
100 int pspp_reg_rc = CMD_SUCCESS;
102 static bool run_regression (const struct casefile *, void *);
105 STATISTICS subcommand output functions.
107 static void reg_stats_r (pspp_linreg_cache *);
108 static void reg_stats_coeff (pspp_linreg_cache *);
109 static void reg_stats_anova (pspp_linreg_cache *);
110 static void reg_stats_outs (pspp_linreg_cache *);
111 static void reg_stats_zpp (pspp_linreg_cache *);
112 static void reg_stats_label (pspp_linreg_cache *);
113 static void reg_stats_sha (pspp_linreg_cache *);
114 static void reg_stats_ci (pspp_linreg_cache *);
115 static void reg_stats_f (pspp_linreg_cache *);
116 static void reg_stats_bcov (pspp_linreg_cache *);
117 static void reg_stats_ses (pspp_linreg_cache *);
118 static void reg_stats_xtx (pspp_linreg_cache *);
119 static void reg_stats_collin (pspp_linreg_cache *);
120 static void reg_stats_tol (pspp_linreg_cache *);
121 static void reg_stats_selection (pspp_linreg_cache *);
122 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
123 int, pspp_linreg_cache *);
126 reg_stats_r (pspp_linreg_cache * c)
136 rsq = c->ssm / c->sst;
137 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
138 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
139 t = tab_create (n_cols, n_rows, 0);
140 tab_dim (t, tab_natural_dimensions);
141 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
142 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
143 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
144 tab_vline (t, TAL_0, 1, 0, 0);
146 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
147 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
148 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
149 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
150 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
151 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
152 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
153 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
154 tab_title (t, _("Model Summary"));
159 Table showing estimated regression coefficients.
162 reg_stats_coeff (pspp_linreg_cache * c)
174 const struct variable *v;
175 const union value *val;
180 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
181 n_rows = c->n_coeffs + 2;
183 t = tab_create (n_cols, n_rows, 0);
184 tab_headers (t, 2, 0, 1, 0);
185 tab_dim (t, tab_natural_dimensions);
186 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
187 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
188 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
189 tab_vline (t, TAL_0, 1, 0, 0);
191 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
192 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
193 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
194 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
195 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
196 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
197 coeff = c->coeff[0].estimate;
198 tab_float (t, 2, 1, 0, coeff, 10, 2);
199 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
200 tab_float (t, 3, 1, 0, std_err, 10, 2);
201 beta = coeff / c->depvar_std;
202 tab_float (t, 4, 1, 0, beta, 10, 2);
203 t_stat = coeff / std_err;
204 tab_float (t, 5, 1, 0, t_stat, 10, 2);
205 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
206 tab_float (t, 6, 1, 0, pval, 10, 2);
207 for (j = 1; j <= c->n_indeps; j++)
209 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
210 label = var_to_string (v);
211 /* Do not overwrite the variable's name. */
212 strncpy (tmp, label, MAX_STRING);
213 if (v->type == ALPHA)
216 Append the value associated with this coefficient.
217 This makes sense only if we us the usual binary encoding
221 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
222 val_s = value_to_string (val, v);
223 strncat (tmp, val_s, MAX_STRING);
226 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
228 Regression coefficients.
230 coeff = c->coeff[j].estimate;
231 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
233 Standard error of the coefficients.
235 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
236 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
238 'Standardized' coefficient, i.e., regression coefficient
239 if all variables had unit variance.
241 beta = gsl_vector_get (c->indep_std, j);
242 beta *= coeff / c->depvar_std;
243 tab_float (t, 4, j + 1, 0, beta, 10, 2);
246 Test statistic for H0: coefficient is 0.
248 t_stat = coeff / std_err;
249 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
251 P values for the test statistic above.
253 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
254 tab_float (t, 6, j + 1, 0, pval, 10, 2);
256 tab_title (t, _("Coefficients"));
262 Display the ANOVA table.
265 reg_stats_anova (pspp_linreg_cache * c)
269 const double msm = c->ssm / c->dfm;
270 const double mse = c->sse / c->dfe;
271 const double F = msm / mse;
272 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
277 t = tab_create (n_cols, n_rows, 0);
278 tab_headers (t, 2, 0, 1, 0);
279 tab_dim (t, tab_natural_dimensions);
281 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
283 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
284 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
285 tab_vline (t, TAL_0, 1, 0, 0);
287 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
288 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
289 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
290 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
291 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
293 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
294 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
295 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
297 /* Sums of Squares */
298 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
299 tab_float (t, 2, 3, 0, c->sst, 10, 2);
300 tab_float (t, 2, 2, 0, c->sse, 10, 2);
303 /* Degrees of freedom */
304 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
305 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
306 tab_float (t, 3, 3, 0, c->dft, 4, 0);
310 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
311 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
313 tab_float (t, 5, 1, 0, F, 8, 3);
315 tab_float (t, 6, 1, 0, pval, 8, 3);
317 tab_title (t, _("ANOVA"));
321 reg_stats_outs (pspp_linreg_cache * c)
326 reg_stats_zpp (pspp_linreg_cache * c)
331 reg_stats_label (pspp_linreg_cache * c)
336 reg_stats_sha (pspp_linreg_cache * c)
341 reg_stats_ci (pspp_linreg_cache * c)
346 reg_stats_f (pspp_linreg_cache * c)
351 reg_stats_bcov (pspp_linreg_cache * c)
363 n_cols = c->n_indeps + 1 + 2;
364 n_rows = 2 * (c->n_indeps + 1);
365 t = tab_create (n_cols, n_rows, 0);
366 tab_headers (t, 2, 0, 1, 0);
367 tab_dim (t, tab_natural_dimensions);
368 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
369 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
370 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
371 tab_vline (t, TAL_0, 1, 0, 0);
372 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
373 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
374 for (i = 1; i < c->n_coeffs; i++)
376 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
377 label = var_to_string (v);
378 tab_text (t, 2, i, TAB_CENTER, label);
379 tab_text (t, i + 2, 0, TAB_CENTER, label);
380 for (k = 1; k < c->n_coeffs; k++)
382 col = (i <= k) ? k : i;
383 row = (i <= k) ? i : k;
384 tab_float (t, k + 2, i, TAB_CENTER,
385 gsl_matrix_get (c->cov, row, col), 8, 3);
388 tab_title (t, _("Coefficient Correlations"));
392 reg_stats_ses (pspp_linreg_cache * c)
397 reg_stats_xtx (pspp_linreg_cache * c)
402 reg_stats_collin (pspp_linreg_cache * c)
407 reg_stats_tol (pspp_linreg_cache * c)
412 reg_stats_selection (pspp_linreg_cache * c)
418 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
419 int keyword, pspp_linreg_cache * c)
428 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
431 The order here must match the order in which the STATISTICS
432 keywords appear in the specification section above.
459 Set everything but F.
461 for (i = 0; i < f; i++)
468 for (i = 0; i < all; i++)
476 Default output: ANOVA table, parameter estimates,
477 and statistics for variables not entered into model,
480 if (keywords[defaults] | d)
488 statistics_keyword_output (reg_stats_r, keywords[r], c);
489 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
490 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
491 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
492 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
493 statistics_keyword_output (reg_stats_label, keywords[label], c);
494 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
495 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
496 statistics_keyword_output (reg_stats_f, keywords[f], c);
497 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
498 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
499 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
500 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
501 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
502 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
505 subcommand_save (int save, pspp_linreg_cache *lc, const struct casefile *cf, int *is_missing)
510 const union value **vals;
511 struct casereader *r;
515 assert (is_missing != NULL);
519 vals = xnmalloc (n_variables, sizeof (*vals));
520 for (r = casefile_get_reader (cf); casereader_read (r, &c);
523 case_num = casereader_cnum (r) - 1;
524 if (!is_missing[case_num])
526 for (i = 0; i < n_variables; ++i)
528 vals[i] = case_data (&c, v_variables[i]->fv);
530 residual = (*lc->predict) ((const struct variable **) v_variables,
531 (const union value **) vals, lc, n_variables);
538 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
542 for (i = 0; i < n_vars; i++)
544 if (v->index == varlist[i]->index)
552 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
557 struct variable **varlist;
558 struct pspp_linreg_coeff *coeff;
559 const struct variable *v;
562 fprintf (fp, "%s", reg_export_categorical_encode_1);
564 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
565 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
567 coeff = c->coeff + i;
568 v = pspp_linreg_coeff_get_var (coeff, 0);
569 if (v->type == ALPHA)
571 if (!reg_inserted (v, varlist, n_vars))
573 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
575 varlist[n_vars] = (struct variable *) v;
580 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
581 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
583 for (i = 0; i < n_vars - 1; i++)
585 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
587 fprintf (fp, "&%s};\n\t", varlist[i]->name);
589 for (i = 0; i < n_vars; i++)
591 coeff = c->coeff + i;
592 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
594 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
595 varlist[i]->obs_vals->n_categories);
597 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
599 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
600 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
601 value_to_string (val, varlist[i]));
604 fprintf (fp, "%s", reg_export_categorical_encode_2);
608 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
611 struct pspp_linreg_coeff *coeff;
612 const struct variable *v;
614 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
615 for (i = 1; i < c->n_indeps; i++)
617 coeff = c->coeff + i;
618 v = pspp_linreg_coeff_get_var (coeff, 0);
619 fprintf (fp, "\"%s\",\n\t\t", v->name);
621 coeff = c->coeff + i;
622 v = pspp_linreg_coeff_get_var (coeff, 0);
623 fprintf (fp, "\"%s\"};\n\t", v->name);
626 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
628 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
629 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
630 reg_print_depvars (fp, c);
631 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
633 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
634 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
637 reg_has_categorical (pspp_linreg_cache * c)
640 const struct variable *v;
642 for (i = 1; i < c->n_coeffs; i++)
644 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
645 if (v->type == ALPHA)
654 subcommand_export (int export, pspp_linreg_cache * c)
659 int n_quantiles = 100;
662 struct pspp_linreg_coeff coeff;
667 assert (model_file != NULL);
668 fp = fopen (fh_get_filename (model_file), "w");
670 fprintf (fp, "%s", reg_preamble);
671 reg_print_getvar (fp, c);
672 if (reg_has_categorical (c))
674 reg_print_categorical_encoding (fp, c);
676 fprintf (fp, "%s", reg_export_t_quantiles_1);
677 increment = 0.5 / (double) increment;
678 for (i = 0; i < n_quantiles - 1; i++)
680 tmp = 0.5 + 0.005 * (double) i;
681 fprintf (fp, "%.15e,\n\t\t",
682 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
684 fprintf (fp, "%.15e};\n\t",
685 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
686 fprintf (fp, "%s", reg_export_t_quantiles_2);
687 fprintf (fp, "%s", reg_mean_cmt);
688 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
689 fprintf (fp, "const char *var_names[])\n{\n\t");
690 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
691 for (i = 1; i < c->n_indeps; i++)
694 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
697 fprintf (fp, "%.15e};\n\t", coeff.estimate);
699 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
700 fprintf (fp, "int i;\n\tint j;\n\n\t");
701 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
702 fprintf (fp, "%s", reg_getvar);
703 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
705 for (i = 0; i < c->cov->size1 - 1; i++)
708 for (j = 0; j < c->cov->size2 - 1; j++)
710 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
712 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
715 for (j = 0; j < c->cov->size2 - 1; j++)
717 fprintf (fp, "%.15e, ",
718 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
720 fprintf (fp, "%.15e}\n\t",
721 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
722 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
724 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
725 fprintf (fp, "%s", reg_variance);
726 fprintf (fp, "%s", reg_export_confidence_interval);
727 tmp = c->mse * c->mse;
728 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
729 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
730 fprintf (fp, "%s", reg_export_prediction_interval_3);
732 fp = fopen ("pspp_model_reg.h", "w");
733 fprintf (fp, "%s", reg_header);
738 regression_custom_export (struct cmd_regression *cmd UNUSED)
740 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
741 if (!lex_force_match ('('))
748 model_file = fh_parse (FH_REF_FILE);
749 if (model_file == NULL)
753 if (!lex_force_match (')'))
760 cmd_regression (void)
762 if (!parse_regression (&cmd))
764 if (!multipass_procedure_with_splits (run_regression, &cmd))
765 return CMD_CASCADING_FAILURE;
773 Is variable k the dependent variable?
776 is_depvar (size_t k, const struct variable *v)
779 compare_var_names returns 0 if the variable
782 if (!compare_var_names (v, v_variables[k], NULL))
789 Mark missing cases. Return the number of non-missing cases.
792 mark_missing_cases (const struct casefile *cf, struct variable *v,
793 int *is_missing_case, double n_data)
795 struct casereader *r;
798 const union value *val;
800 for (r = casefile_get_reader (cf);
801 casereader_read (r, &c); case_destroy (&c))
803 row = casereader_cnum (r) - 1;
805 val = case_data (&c, v->fv);
806 cat_value_update (v, val);
807 if (mv_is_value_missing (&v->miss, val))
809 if (!is_missing_case[row])
811 /* Now it is missing. */
813 is_missing_case[row] = 1;
817 casereader_destroy (r);
822 /* Parser for the variables sub command */
824 regression_custom_variables(struct cmd_regression *cmd UNUSED)
829 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
834 if (!parse_variables (default_dict, &v_variables, &n_variables,
845 Count the explanatory variables. The user may or may
846 not have specified a response variable in the syntax.
849 int get_n_indep (const struct variable *v)
854 result = n_variables;
855 while (i < n_variables)
857 if (is_depvar (i, v))
867 Read from the active file. Identify the explanatory variables in
868 v_variables. Encode categorical variables. Drop cases with missing
872 int prepare_data (int n_data, int is_missing_case[],
873 struct variable **indep_vars,
874 struct variable *depvar,
875 const struct casefile *cf)
880 assert (indep_vars != NULL);
882 for (i = 0; i < n_variables; i++)
884 if (!is_depvar (i, depvar))
886 indep_vars[j] = v_variables[i];
888 if (v_variables[i]->type == ALPHA)
890 /* Make a place to hold the binary vectors
891 corresponding to this variable's values. */
892 cat_stored_values_create (v_variables[i]);
894 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
898 Mark missing cases for the dependent variable.
900 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
905 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
908 size_t n_data = 0; /* Number of valide cases. */
909 size_t n_cases; /* Number of cases. */
915 Keep track of the missing cases.
917 int *is_missing_case;
918 const union value *val;
919 struct casereader *r;
921 struct variable **indep_vars;
922 struct design_matrix *X;
924 pspp_linreg_cache *lcache;
925 pspp_linreg_opts lopts;
929 dict_get_vars (default_dict, &v_variables, &n_variables,
933 n_cases = casefile_get_case_cnt (cf);
935 for (i = 0; i < cmd.n_dependent; i++)
937 if (cmd.v_dependent[i]->type != NUMERIC)
939 msg (SE, gettext ("Dependent variable must be numeric."));
940 pspp_reg_rc = CMD_FAILURE;
945 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
947 lopts.get_depvar_mean_std = 1;
950 for (k = 0; k < cmd.n_dependent; k++)
952 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
953 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
954 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
955 assert (indep_vars != NULL);
957 for (i = 0; i < n_cases; i++)
959 is_missing_case[i] = 0;
961 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
963 (const struct casefile *) cf);
964 Y = gsl_vector_alloc (n_data);
967 design_matrix_create (n_indep, (const struct variable **) indep_vars,
969 for (i = 0; i < X->m->size2; i++)
971 lopts.get_indep_mean_std[i] = 1;
973 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
974 lcache->indep_means = gsl_vector_alloc (X->m->size2);
975 lcache->indep_std = gsl_vector_alloc (X->m->size2);
976 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
978 For large data sets, use QR decomposition.
980 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
982 lcache->method = PSPP_LINREG_SVD;
986 The second pass creates the design matrix.
989 for (r = casefile_get_reader (cf); casereader_read (r, &c);
991 /* Iterate over the cases. */
993 case_num = casereader_cnum (r) - 1;
994 if (!is_missing_case[case_num])
996 for (i = 0; i < n_variables; ++i) /* Iterate over the
1001 val = case_data (&c, v_variables[i]->fv);
1003 Independent/dependent variable separation. The
1004 'variables' subcommand specifies a varlist which contains
1005 both dependent and independent variables. The dependent
1006 variables are specified with the 'dependent'
1007 subcommand, and maybe also in the 'variables' subcommand.
1008 We need to separate the two.
1010 if (!is_depvar (i, cmd.v_dependent[k]))
1012 if (v_variables[i]->type == ALPHA)
1014 design_matrix_set_categorical (X, row, v_variables[i], val);
1016 else if (v_variables[i]->type == NUMERIC)
1018 design_matrix_set_numeric (X, row, v_variables[i], val);
1022 val = case_data (&c, cmd.v_dependent[k]->fv);
1023 gsl_vector_set (Y, row, val->f);
1028 Now that we know the number of coefficients, allocate space
1029 and store pointers to the variables that correspond to the
1032 pspp_linreg_coeff_init (lcache, X);
1035 Find the least-squares estimates and other statistics.
1037 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1038 subcommand_statistics (cmd.a_statistics, lcache);
1039 subcommand_save (cmd.sbc_save, lcache, cf, is_missing_case);
1040 subcommand_export (cmd.sbc_export, lcache);
1041 gsl_vector_free (Y);
1042 design_matrix_destroy (X);
1044 pspp_linreg_cache_free (lcache);
1045 free (lopts.get_indep_mean_std);
1046 casereader_destroy (r);
1049 free (is_missing_case);