1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
78 static struct cmd_regression cmd;
81 Variables used (both explanatory and response).
83 static struct variable **v_variables;
88 static size_t n_variables;
91 File where the model will be saved if the EXPORT subcommand
94 struct file_handle *model_file;
97 Return value for the procedure.
99 int pspp_reg_rc = CMD_SUCCESS;
101 static bool run_regression (const struct casefile *, void *);
104 STATISTICS subcommand output functions.
106 static void reg_stats_r (pspp_linreg_cache *);
107 static void reg_stats_coeff (pspp_linreg_cache *);
108 static void reg_stats_anova (pspp_linreg_cache *);
109 static void reg_stats_outs (pspp_linreg_cache *);
110 static void reg_stats_zpp (pspp_linreg_cache *);
111 static void reg_stats_label (pspp_linreg_cache *);
112 static void reg_stats_sha (pspp_linreg_cache *);
113 static void reg_stats_ci (pspp_linreg_cache *);
114 static void reg_stats_f (pspp_linreg_cache *);
115 static void reg_stats_bcov (pspp_linreg_cache *);
116 static void reg_stats_ses (pspp_linreg_cache *);
117 static void reg_stats_xtx (pspp_linreg_cache *);
118 static void reg_stats_collin (pspp_linreg_cache *);
119 static void reg_stats_tol (pspp_linreg_cache *);
120 static void reg_stats_selection (pspp_linreg_cache *);
121 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
122 int, pspp_linreg_cache *);
125 reg_stats_r (pspp_linreg_cache * c)
135 rsq = c->ssm / c->sst;
136 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
137 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
138 t = tab_create (n_cols, n_rows, 0);
139 tab_dim (t, tab_natural_dimensions);
140 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
141 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
142 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
143 tab_vline (t, TAL_0, 1, 0, 0);
145 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
146 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
147 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
148 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
149 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
150 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
151 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
152 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
153 tab_title (t, _("Model Summary"));
158 Table showing estimated regression coefficients.
161 reg_stats_coeff (pspp_linreg_cache * c)
173 const struct variable *v;
174 const union value *val;
179 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
180 n_rows = c->n_coeffs + 2;
182 t = tab_create (n_cols, n_rows, 0);
183 tab_headers (t, 2, 0, 1, 0);
184 tab_dim (t, tab_natural_dimensions);
185 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
186 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
187 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
188 tab_vline (t, TAL_0, 1, 0, 0);
190 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
191 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
192 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
193 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
194 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
195 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
196 coeff = c->coeff[0].estimate;
197 tab_float (t, 2, 1, 0, coeff, 10, 2);
198 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
199 tab_float (t, 3, 1, 0, std_err, 10, 2);
200 beta = coeff / c->depvar_std;
201 tab_float (t, 4, 1, 0, beta, 10, 2);
202 t_stat = coeff / std_err;
203 tab_float (t, 5, 1, 0, t_stat, 10, 2);
204 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
205 tab_float (t, 6, 1, 0, pval, 10, 2);
206 for (j = 1; j <= c->n_indeps; j++)
208 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
209 label = var_to_string (v);
210 /* Do not overwrite the variable's name. */
211 strncpy (tmp, label, MAX_STRING);
212 if (v->type == ALPHA)
215 Append the value associated with this coefficient.
216 This makes sense only if we us the usual binary encoding
220 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
221 val_s = value_to_string (val, v);
222 strncat (tmp, val_s, MAX_STRING);
225 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
227 Regression coefficients.
229 coeff = c->coeff[j].estimate;
230 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
232 Standard error of the coefficients.
234 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
235 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
237 'Standardized' coefficient, i.e., regression coefficient
238 if all variables had unit variance.
240 beta = gsl_vector_get (c->indep_std, j);
241 beta *= coeff / c->depvar_std;
242 tab_float (t, 4, j + 1, 0, beta, 10, 2);
245 Test statistic for H0: coefficient is 0.
247 t_stat = coeff / std_err;
248 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
250 P values for the test statistic above.
252 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
253 tab_float (t, 6, j + 1, 0, pval, 10, 2);
255 tab_title (t, _("Coefficients"));
261 Display the ANOVA table.
264 reg_stats_anova (pspp_linreg_cache * c)
268 const double msm = c->ssm / c->dfm;
269 const double mse = c->sse / c->dfe;
270 const double F = msm / mse;
271 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
276 t = tab_create (n_cols, n_rows, 0);
277 tab_headers (t, 2, 0, 1, 0);
278 tab_dim (t, tab_natural_dimensions);
280 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
282 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
283 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
284 tab_vline (t, TAL_0, 1, 0, 0);
286 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
287 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
288 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
289 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
290 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
292 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
293 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
294 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
296 /* Sums of Squares */
297 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
298 tab_float (t, 2, 3, 0, c->sst, 10, 2);
299 tab_float (t, 2, 2, 0, c->sse, 10, 2);
302 /* Degrees of freedom */
303 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
304 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
305 tab_float (t, 3, 3, 0, c->dft, 4, 0);
309 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
310 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
312 tab_float (t, 5, 1, 0, F, 8, 3);
314 tab_float (t, 6, 1, 0, pval, 8, 3);
316 tab_title (t, _("ANOVA"));
320 reg_stats_outs (pspp_linreg_cache * c)
325 reg_stats_zpp (pspp_linreg_cache * c)
330 reg_stats_label (pspp_linreg_cache * c)
335 reg_stats_sha (pspp_linreg_cache * c)
340 reg_stats_ci (pspp_linreg_cache * c)
345 reg_stats_f (pspp_linreg_cache * c)
350 reg_stats_bcov (pspp_linreg_cache * c)
362 n_cols = c->n_indeps + 1 + 2;
363 n_rows = 2 * (c->n_indeps + 1);
364 t = tab_create (n_cols, n_rows, 0);
365 tab_headers (t, 2, 0, 1, 0);
366 tab_dim (t, tab_natural_dimensions);
367 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
368 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
369 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
370 tab_vline (t, TAL_0, 1, 0, 0);
371 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
372 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
373 for (i = 1; i < c->n_coeffs; i++)
375 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
376 label = var_to_string (v);
377 tab_text (t, 2, i, TAB_CENTER, label);
378 tab_text (t, i + 2, 0, TAB_CENTER, label);
379 for (k = 1; k < c->n_coeffs; k++)
381 col = (i <= k) ? k : i;
382 row = (i <= k) ? i : k;
383 tab_float (t, k + 2, i, TAB_CENTER,
384 gsl_matrix_get (c->cov, row, col), 8, 3);
387 tab_title (t, _("Coefficient Correlations"));
391 reg_stats_ses (pspp_linreg_cache * c)
396 reg_stats_xtx (pspp_linreg_cache * c)
401 reg_stats_collin (pspp_linreg_cache * c)
406 reg_stats_tol (pspp_linreg_cache * c)
411 reg_stats_selection (pspp_linreg_cache * c)
417 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
418 int keyword, pspp_linreg_cache * c)
427 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
430 The order here must match the order in which the STATISTICS
431 keywords appear in the specification section above.
458 Set everything but F.
460 for (i = 0; i < f; i++)
467 for (i = 0; i < all; i++)
475 Default output: ANOVA table, parameter estimates,
476 and statistics for variables not entered into model,
479 if (keywords[defaults] | d)
487 statistics_keyword_output (reg_stats_r, keywords[r], c);
488 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
489 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
490 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
491 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
492 statistics_keyword_output (reg_stats_label, keywords[label], c);
493 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
494 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
495 statistics_keyword_output (reg_stats_f, keywords[f], c);
496 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
497 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
498 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
499 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
500 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
501 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
504 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
508 for (i = 0; i < n_vars; i++)
510 if (v->index == varlist[i]->index)
518 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
523 struct variable **varlist;
524 struct pspp_linreg_coeff *coeff;
525 const struct variable *v;
528 fprintf (fp, "%s", reg_export_categorical_encode_1);
530 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
531 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
533 coeff = c->coeff + i;
534 v = pspp_linreg_coeff_get_var (coeff, 0);
535 if (v->type == ALPHA)
537 if (!reg_inserted (v, varlist, n_vars))
539 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
541 varlist[n_vars] = (struct variable *) v;
546 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
547 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
549 for (i = 0; i < n_vars - 1; i++)
551 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
553 fprintf (fp, "&%s};\n\t", varlist[i]->name);
555 for (i = 0; i < n_vars; i++)
557 coeff = c->coeff + i;
558 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
560 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
561 varlist[i]->obs_vals->n_categories);
563 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
565 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
566 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
567 value_to_string (val, varlist[i]));
570 fprintf (fp, "%s", reg_export_categorical_encode_2);
574 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
577 struct pspp_linreg_coeff *coeff;
578 const struct variable *v;
580 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
581 for (i = 1; i < c->n_indeps; i++)
583 coeff = c->coeff + i;
584 v = pspp_linreg_coeff_get_var (coeff, 0);
585 fprintf (fp, "\"%s\",\n\t\t", v->name);
587 coeff = c->coeff + i;
588 v = pspp_linreg_coeff_get_var (coeff, 0);
589 fprintf (fp, "\"%s\"};\n\t", v->name);
592 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
594 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
595 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
596 reg_print_depvars (fp, c);
597 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
599 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
600 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
603 reg_has_categorical (pspp_linreg_cache * c)
606 const struct variable *v;
608 for (i = 1; i < c->n_coeffs; i++)
610 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
611 if (v->type == ALPHA)
620 subcommand_export (int export, pspp_linreg_cache * c)
625 int n_quantiles = 100;
628 struct pspp_linreg_coeff coeff;
633 assert (model_file != NULL);
634 fp = fopen (fh_get_filename (model_file), "w");
636 fprintf (fp, "%s", reg_preamble);
637 reg_print_getvar (fp, c);
638 if (reg_has_categorical (c))
640 reg_print_categorical_encoding (fp, c);
642 fprintf (fp, "%s", reg_export_t_quantiles_1);
643 increment = 0.5 / (double) increment;
644 for (i = 0; i < n_quantiles - 1; i++)
646 tmp = 0.5 + 0.005 * (double) i;
647 fprintf (fp, "%.15e,\n\t\t",
648 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
650 fprintf (fp, "%.15e};\n\t",
651 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
652 fprintf (fp, "%s", reg_export_t_quantiles_2);
653 fprintf (fp, "%s", reg_mean_cmt);
654 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
655 fprintf (fp, "const char *var_names[])\n{\n\t");
656 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
657 for (i = 1; i < c->n_indeps; i++)
660 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
663 fprintf (fp, "%.15e};\n\t", coeff.estimate);
665 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
666 fprintf (fp, "int i;\n\tint j;\n\n\t");
667 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
668 fprintf (fp, "%s", reg_getvar);
669 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
671 for (i = 0; i < c->cov->size1 - 1; i++)
674 for (j = 0; j < c->cov->size2 - 1; j++)
676 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
678 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
681 for (j = 0; j < c->cov->size2 - 1; j++)
683 fprintf (fp, "%.15e, ",
684 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
686 fprintf (fp, "%.15e}\n\t",
687 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
688 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
690 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
691 fprintf (fp, "%s", reg_variance);
692 fprintf (fp, "%s", reg_export_confidence_interval);
693 tmp = c->mse * c->mse;
694 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
695 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
696 fprintf (fp, "%s", reg_export_prediction_interval_3);
698 fp = fopen ("pspp_model_reg.h", "w");
699 fprintf (fp, "%s", reg_header);
704 regression_custom_export (struct cmd_regression *cmd UNUSED)
706 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
707 if (!lex_force_match ('('))
714 model_file = fh_parse (FH_REF_FILE);
715 if (model_file == NULL)
719 if (!lex_force_match (')'))
726 cmd_regression (void)
728 if (!parse_regression (&cmd))
730 if (!multipass_procedure_with_splits (run_regression, &cmd))
731 return CMD_CASCADING_FAILURE;
739 Is variable k the dependent variable?
742 is_depvar (size_t k, const struct variable *v)
745 compare_var_names returns 0 if the variable
748 if (!compare_var_names (v, v_variables[k], NULL))
755 Mark missing cases. Return the number of non-missing cases.
758 mark_missing_cases (const struct casefile *cf, struct variable *v,
759 int *is_missing_case, double n_data)
761 struct casereader *r;
764 const union value *val;
766 for (r = casefile_get_reader (cf);
767 casereader_read (r, &c); case_destroy (&c))
769 row = casereader_cnum (r) - 1;
771 val = case_data (&c, v->fv);
772 cat_value_update (v, val);
773 if (mv_is_value_missing (&v->miss, val))
775 if (!is_missing_case[row])
777 /* Now it is missing. */
779 is_missing_case[row] = 1;
783 casereader_destroy (r);
788 /* Parser for the variables sub command */
790 regression_custom_variables(struct cmd_regression *cmd UNUSED)
795 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
800 if (!parse_variables (default_dict, &v_variables, &n_variables,
811 Count the explanatory variables. The user may or may
812 not have specified a response variable in the syntax.
815 int get_n_indep (const struct variable *v)
820 result = n_variables;
821 while (i < n_variables)
823 if (is_depvar (i, v))
833 Read from the active file. Identify the explanatory variables in
834 v_variables. Encode categorical variables. Drop cases with missing
838 int prepare_data (int n_data, int is_missing_case[],
839 struct variable **indep_vars,
840 struct variable *depvar,
841 const struct casefile *cf)
846 assert (indep_vars != NULL);
848 for (i = 0; i < n_variables; i++)
850 if (!is_depvar (i, depvar))
852 indep_vars[j] = v_variables[i];
854 if (v_variables[i]->type == ALPHA)
856 /* Make a place to hold the binary vectors
857 corresponding to this variable's values. */
858 cat_stored_values_create (v_variables[i]);
860 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
864 Mark missing cases for the dependent variable.
866 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
871 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
874 size_t n_data = 0; /* Number of valide cases. */
875 size_t n_cases; /* Number of cases. */
881 Keep track of the missing cases.
883 int *is_missing_case;
884 const union value *val;
885 struct casereader *r;
887 struct variable **indep_vars;
888 struct design_matrix *X;
890 pspp_linreg_cache *lcache;
891 pspp_linreg_opts lopts;
895 dict_get_vars (default_dict, &v_variables, &n_variables,
899 n_cases = casefile_get_case_cnt (cf);
901 for (i = 0; i < cmd.n_dependent; i++)
903 if (cmd.v_dependent[i]->type != NUMERIC)
905 msg (SE, gettext ("Dependent variable must be numeric."));
906 pspp_reg_rc = CMD_FAILURE;
911 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
913 lopts.get_depvar_mean_std = 1;
916 for (k = 0; k < cmd.n_dependent; k++)
918 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
919 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
920 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
921 assert (indep_vars != NULL);
923 for (i = 0; i < n_cases; i++)
925 is_missing_case[i] = 0;
927 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
929 (const struct casefile *) cf);
930 Y = gsl_vector_alloc (n_data);
933 design_matrix_create (n_indep, (const struct variable **) indep_vars,
935 for (i = 0; i < X->m->size2; i++)
937 lopts.get_indep_mean_std[i] = 1;
939 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
940 lcache->indep_means = gsl_vector_alloc (X->m->size2);
941 lcache->indep_std = gsl_vector_alloc (X->m->size2);
942 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
944 For large data sets, use QR decomposition.
946 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
948 lcache->method = PSPP_LINREG_SVD;
952 The second pass creates the design matrix.
955 for (r = casefile_get_reader (cf); casereader_read (r, &c);
957 /* Iterate over the cases. */
959 case_num = casereader_cnum (r) - 1;
960 if (!is_missing_case[case_num])
962 for (i = 0; i < n_variables; ++i) /* Iterate over the
967 val = case_data (&c, v_variables[i]->fv);
969 Independent/dependent variable separation. The
970 'variables' subcommand specifies a varlist which contains
971 both dependent and independent variables. The dependent
972 variables are specified with the 'dependent'
973 subcommand, and maybe also in the 'variables' subcommand.
974 We need to separate the two.
976 if (!is_depvar (i, cmd.v_dependent[k]))
978 if (v_variables[i]->type == ALPHA)
980 design_matrix_set_categorical (X, row, v_variables[i], val);
982 else if (v_variables[i]->type == NUMERIC)
984 design_matrix_set_numeric (X, row, v_variables[i], val);
988 val = case_data (&c, cmd.v_dependent[k]->fv);
989 gsl_vector_set (Y, row, val->f);
994 Now that we know the number of coefficients, allocate space
995 and store pointers to the variables that correspond to the
998 pspp_linreg_coeff_init (lcache, X);
1001 Find the least-squares estimates and other statistics.
1003 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
1004 subcommand_statistics (cmd.a_statistics, lcache);
1005 subcommand_export (cmd.sbc_export, lcache);
1006 gsl_vector_free (Y);
1007 design_matrix_destroy (X);
1009 pspp_linreg_cache_free (lcache);
1010 free (lopts.get_indep_mean_std);
1011 casereader_destroy (r);
1014 free (is_missing_case);