1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
78 static struct cmd_regression cmd;
81 Variables used (both explanatory and response).
83 static struct variable **v_variables;
88 static size_t n_variables;
91 File where the model will be saved if the EXPORT subcommand
94 struct file_handle *model_file;
97 Return value for the procedure.
99 int pspp_reg_rc = CMD_SUCCESS;
101 static bool run_regression (const struct casefile *, void *);
104 STATISTICS subcommand output functions.
106 static void reg_stats_r (pspp_linreg_cache *);
107 static void reg_stats_coeff (pspp_linreg_cache *);
108 static void reg_stats_anova (pspp_linreg_cache *);
109 static void reg_stats_outs (pspp_linreg_cache *);
110 static void reg_stats_zpp (pspp_linreg_cache *);
111 static void reg_stats_label (pspp_linreg_cache *);
112 static void reg_stats_sha (pspp_linreg_cache *);
113 static void reg_stats_ci (pspp_linreg_cache *);
114 static void reg_stats_f (pspp_linreg_cache *);
115 static void reg_stats_bcov (pspp_linreg_cache *);
116 static void reg_stats_ses (pspp_linreg_cache *);
117 static void reg_stats_xtx (pspp_linreg_cache *);
118 static void reg_stats_collin (pspp_linreg_cache *);
119 static void reg_stats_tol (pspp_linreg_cache *);
120 static void reg_stats_selection (pspp_linreg_cache *);
121 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
122 int, pspp_linreg_cache *);
125 reg_stats_r (pspp_linreg_cache * c)
135 rsq = c->ssm / c->sst;
136 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
137 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
138 t = tab_create (n_cols, n_rows, 0);
139 tab_dim (t, tab_natural_dimensions);
140 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
141 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
142 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
143 tab_vline (t, TAL_0, 1, 0, 0);
145 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
146 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
147 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
148 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
149 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
150 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
151 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
152 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
153 tab_title (t, _("Model Summary"));
158 Table showing estimated regression coefficients.
161 reg_stats_coeff (pspp_linreg_cache * c)
173 const struct variable *v;
174 const union value *val;
179 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
180 n_rows = c->n_coeffs + 2;
182 t = tab_create (n_cols, n_rows, 0);
183 tab_headers (t, 2, 0, 1, 0);
184 tab_dim (t, tab_natural_dimensions);
185 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
186 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
187 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
188 tab_vline (t, TAL_0, 1, 0, 0);
190 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
191 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
192 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
193 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
194 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
195 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
196 coeff = c->coeff[0].estimate;
197 tab_float (t, 2, 1, 0, coeff, 10, 2);
198 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
199 tab_float (t, 3, 1, 0, std_err, 10, 2);
200 beta = coeff / c->depvar_std;
201 tab_float (t, 4, 1, 0, beta, 10, 2);
202 t_stat = coeff / std_err;
203 tab_float (t, 5, 1, 0, t_stat, 10, 2);
204 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
205 tab_float (t, 6, 1, 0, pval, 10, 2);
206 for (j = 1; j <= c->n_indeps; j++)
208 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
209 label = var_to_string (v);
210 /* Do not overwrite the variable's name. */
211 strncpy (tmp, label, MAX_STRING);
212 if (v->type == ALPHA)
215 Append the value associated with this coefficient.
216 This makes sense only if we us the usual binary encoding
220 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
221 val_s = value_to_string (val, v);
222 strncat (tmp, val_s, MAX_STRING);
225 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
227 Regression coefficients.
229 coeff = c->coeff[j].estimate;
230 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
232 Standard error of the coefficients.
234 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
235 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
237 'Standardized' coefficient, i.e., regression coefficient
238 if all variables had unit variance.
240 beta = gsl_vector_get (c->indep_std, j);
241 beta *= coeff / c->depvar_std;
242 tab_float (t, 4, j + 1, 0, beta, 10, 2);
245 Test statistic for H0: coefficient is 0.
247 t_stat = coeff / std_err;
248 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
250 P values for the test statistic above.
252 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
253 tab_float (t, 6, j + 1, 0, pval, 10, 2);
255 tab_title (t, _("Coefficients"));
261 Display the ANOVA table.
264 reg_stats_anova (pspp_linreg_cache * c)
268 const double msm = c->ssm / c->dfm;
269 const double mse = c->sse / c->dfe;
270 const double F = msm / mse;
271 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
276 t = tab_create (n_cols, n_rows, 0);
277 tab_headers (t, 2, 0, 1, 0);
278 tab_dim (t, tab_natural_dimensions);
280 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
282 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
283 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
284 tab_vline (t, TAL_0, 1, 0, 0);
286 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
287 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
288 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
289 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
290 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
292 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
293 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
294 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
296 /* Sums of Squares */
297 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
298 tab_float (t, 2, 3, 0, c->sst, 10, 2);
299 tab_float (t, 2, 2, 0, c->sse, 10, 2);
302 /* Degrees of freedom */
303 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
304 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
305 tab_float (t, 3, 3, 0, c->dft, 4, 0);
309 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
310 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
312 tab_float (t, 5, 1, 0, F, 8, 3);
314 tab_float (t, 6, 1, 0, pval, 8, 3);
316 tab_title (t, _("ANOVA"));
320 reg_stats_outs (pspp_linreg_cache * c)
325 reg_stats_zpp (pspp_linreg_cache * c)
330 reg_stats_label (pspp_linreg_cache * c)
335 reg_stats_sha (pspp_linreg_cache * c)
340 reg_stats_ci (pspp_linreg_cache * c)
345 reg_stats_f (pspp_linreg_cache * c)
350 reg_stats_bcov (pspp_linreg_cache * c)
362 n_cols = c->n_indeps + 1 + 2;
363 n_rows = 2 * (c->n_indeps + 1);
364 t = tab_create (n_cols, n_rows, 0);
365 tab_headers (t, 2, 0, 1, 0);
366 tab_dim (t, tab_natural_dimensions);
367 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
368 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
369 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
370 tab_vline (t, TAL_0, 1, 0, 0);
371 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
372 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
373 for (i = 1; i < c->n_coeffs; i++)
375 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
376 label = var_to_string (v);
377 tab_text (t, 2, i, TAB_CENTER, label);
378 tab_text (t, i + 2, 0, TAB_CENTER, label);
379 for (k = 1; k < c->n_coeffs; k++)
381 col = (i <= k) ? k : i;
382 row = (i <= k) ? i : k;
383 tab_float (t, k + 2, i, TAB_CENTER,
384 gsl_matrix_get (c->cov, row, col), 8, 3);
387 tab_title (t, _("Coefficient Correlations"));
391 reg_stats_ses (pspp_linreg_cache * c)
396 reg_stats_xtx (pspp_linreg_cache * c)
401 reg_stats_collin (pspp_linreg_cache * c)
406 reg_stats_tol (pspp_linreg_cache * c)
411 reg_stats_selection (pspp_linreg_cache * c)
417 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
418 int keyword, pspp_linreg_cache * c)
427 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
430 The order here must match the order in which the STATISTICS
431 keywords appear in the specification section above.
458 Set everything but F.
460 for (i = 0; i < f; i++)
467 for (i = 0; i < all; i++)
475 Default output: ANOVA table, parameter estimates,
476 and statistics for variables not entered into model,
479 if (keywords[defaults] | d)
487 statistics_keyword_output (reg_stats_r, keywords[r], c);
488 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
489 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
490 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
491 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
492 statistics_keyword_output (reg_stats_label, keywords[label], c);
493 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
494 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
495 statistics_keyword_output (reg_stats_f, keywords[f], c);
496 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
497 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
498 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
499 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
500 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
501 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
504 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
508 for (i = 0; i < n_vars; i++)
510 if (v->index == varlist[i]->index)
518 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
523 struct variable **varlist;
524 struct pspp_linreg_coeff *coeff;
525 const struct variable *v;
528 fprintf (fp, "%s", reg_export_categorical_encode_1);
530 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
531 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
533 coeff = c->coeff + i;
534 v = pspp_linreg_coeff_get_var (coeff, 0);
535 if (v->type == ALPHA)
537 if (!reg_inserted (v, varlist, n_vars))
539 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
541 varlist[n_vars] = (struct variable *) v;
546 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
547 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
549 for (i = 0; i < n_vars - 1; i++)
551 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
553 fprintf (fp, "&%s};\n\t", varlist[i]->name);
555 for (i = 0; i < n_vars; i++)
557 coeff = c->coeff + i;
558 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
560 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
561 varlist[i]->obs_vals->n_categories);
563 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
565 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
566 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
567 value_to_string (val, varlist[i]));
570 fprintf (fp, "%s", reg_export_categorical_encode_2);
574 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
577 struct pspp_linreg_coeff *coeff;
578 const struct variable *v;
580 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
581 for (i = 1; i < c->n_indeps; i++)
583 coeff = c->coeff + i;
584 v = pspp_linreg_coeff_get_var (coeff, 0);
585 fprintf (fp, "\"%s\",\n\t\t", v->name);
587 coeff = c->coeff + i;
588 v = pspp_linreg_coeff_get_var (coeff, 0);
589 fprintf (fp, "\"%s\"};\n\t", v->name);
592 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
594 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
595 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
596 reg_print_depvars (fp, c);
597 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
599 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
600 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
603 subcommand_export (int export, pspp_linreg_cache * c)
608 int n_quantiles = 100;
611 struct pspp_linreg_coeff coeff;
616 assert (model_file != NULL);
617 fp = fopen (fh_get_filename (model_file), "w");
619 fprintf (fp, "%s", reg_preamble);
620 reg_print_getvar (fp, c);
621 reg_print_categorical_encoding (fp, c);
622 fprintf (fp, "%s", reg_export_t_quantiles_1);
623 increment = 0.5 / (double) increment;
624 for (i = 0; i < n_quantiles - 1; i++)
626 tmp = 0.5 + 0.005 * (double) i;
627 fprintf (fp, "%.15e,\n\t\t",
628 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
630 fprintf (fp, "%.15e};\n\t",
631 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
632 fprintf (fp, "%s", reg_export_t_quantiles_2);
633 fprintf (fp, "%s", reg_mean_cmt);
634 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
635 fprintf (fp, "const char *var_names[])\n{\n\t");
636 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
637 for (i = 1; i < c->n_indeps; i++)
640 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
643 fprintf (fp, "%.15e};\n\t", coeff.estimate);
645 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
646 fprintf (fp, "int i;\n\tint j;\n\n\t");
647 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
648 fprintf (fp, "%s", reg_getvar);
649 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
651 for (i = 0; i < c->cov->size1 - 1; i++)
654 for (j = 0; j < c->cov->size2 - 1; j++)
656 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
658 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
661 for (j = 0; j < c->cov->size2 - 1; j++)
663 fprintf (fp, "%.15e, ",
664 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
666 fprintf (fp, "%.15e}\n\t",
667 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
668 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
670 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
671 fprintf (fp, "%s", reg_variance);
672 fprintf (fp, "%s", reg_export_confidence_interval);
673 tmp = c->mse * c->mse;
674 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
675 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
676 fprintf (fp, "%s", reg_export_prediction_interval_3);
678 fp = fopen ("pspp_model_reg.h", "w");
679 fprintf (fp, "%s", reg_header);
684 regression_custom_export (struct cmd_regression *cmd UNUSED)
686 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
687 if (!lex_force_match ('('))
694 model_file = fh_parse (FH_REF_FILE);
695 if (model_file == NULL)
699 if (!lex_force_match (')'))
706 cmd_regression (void)
708 if (!parse_regression (&cmd))
710 if (!multipass_procedure_with_splits (run_regression, &cmd))
711 return CMD_CASCADING_FAILURE;
719 Is variable k the dependent variable?
722 is_depvar (size_t k, const struct variable *v)
725 compare_var_names returns 0 if the variable
728 if (!compare_var_names (v, v_variables[k], NULL))
735 Mark missing cases. Return the number of non-missing cases.
738 mark_missing_cases (const struct casefile *cf, struct variable *v,
739 int *is_missing_case, double n_data)
741 struct casereader *r;
744 const union value *val;
746 for (r = casefile_get_reader (cf);
747 casereader_read (r, &c); case_destroy (&c))
749 row = casereader_cnum (r) - 1;
751 val = case_data (&c, v->fv);
752 cat_value_update (v, val);
753 if (mv_is_value_missing (&v->miss, val))
755 if (!is_missing_case[row])
757 /* Now it is missing. */
759 is_missing_case[row] = 1;
763 casereader_destroy (r);
768 /* Parser for the variables sub command */
770 regression_custom_variables(struct cmd_regression *cmd UNUSED)
775 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
780 if (!parse_variables (default_dict, &v_variables, &n_variables,
791 Count the explanatory variables. The user may or may
792 not have specified a response variable in the syntax.
795 int get_n_indep (const struct variable *v)
800 result = n_variables;
801 while (i < n_variables)
803 if (is_depvar (i, v))
813 Read from the active file. Identify the explanatory variables in
814 v_variables. Encode categorical variables. Drop cases with missing
818 int prepare_data (int n_data, int is_missing_case[],
819 struct variable **indep_vars,
820 struct variable *depvar,
821 const struct casefile *cf)
826 assert (indep_vars != NULL);
828 for (i = 0; i < n_variables; i++)
830 if (!is_depvar (i, depvar))
832 indep_vars[j] = v_variables[i];
834 if (v_variables[i]->type == ALPHA)
836 /* Make a place to hold the binary vectors
837 corresponding to this variable's values. */
838 cat_stored_values_create (v_variables[i]);
840 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
844 Mark missing cases for the dependent variable.
846 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
851 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
854 size_t n_data = 0; /* Number of valide cases. */
855 size_t n_cases; /* Number of cases. */
861 Keep track of the missing cases.
863 int *is_missing_case;
864 const union value *val;
865 struct casereader *r;
867 struct variable **indep_vars;
868 struct design_matrix *X;
870 pspp_linreg_cache *lcache;
871 pspp_linreg_opts lopts;
875 dict_get_vars (default_dict, &v_variables, &n_variables,
879 n_cases = casefile_get_case_cnt (cf);
881 for (i = 0; i < cmd.n_dependent; i++)
883 if (cmd.v_dependent[i]->type != NUMERIC)
885 msg (SE, gettext ("Dependent variable must be numeric."));
886 pspp_reg_rc = CMD_FAILURE;
891 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
893 lopts.get_depvar_mean_std = 1;
896 for (k = 0; k < cmd.n_dependent; k++)
898 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
899 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
900 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
901 assert (indep_vars != NULL);
903 for (i = 0; i < n_cases; i++)
905 is_missing_case[i] = 0;
907 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
909 (const struct casefile *) cf);
910 Y = gsl_vector_alloc (n_data);
913 design_matrix_create (n_indep, (const struct variable **) indep_vars,
915 for (i = 0; i < X->m->size2; i++)
917 lopts.get_indep_mean_std[i] = 1;
919 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
920 lcache->indep_means = gsl_vector_alloc (X->m->size2);
921 lcache->indep_std = gsl_vector_alloc (X->m->size2);
922 lcache->depvar = (const struct variable *) cmd.v_dependent[k];
924 For large data sets, use QR decomposition.
926 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
928 lcache->method = PSPP_LINREG_SVD;
932 The second pass creates the design matrix.
935 for (r = casefile_get_reader (cf); casereader_read (r, &c);
937 /* Iterate over the cases. */
939 case_num = casereader_cnum (r) - 1;
940 if (!is_missing_case[case_num])
942 for (i = 0; i < n_variables; ++i) /* Iterate over the
947 val = case_data (&c, v_variables[i]->fv);
949 Independent/dependent variable separation. The
950 'variables' subcommand specifies a varlist which contains
951 both dependent and independent variables. The dependent
952 variables are specified with the 'dependent'
953 subcommand, and maybe also in the 'variables' subcommand.
954 We need to separate the two.
956 if (!is_depvar (i, cmd.v_dependent[k]))
958 if (v_variables[i]->type == ALPHA)
960 design_matrix_set_categorical (X, row, v_variables[i], val);
962 else if (v_variables[i]->type == NUMERIC)
964 design_matrix_set_numeric (X, row, v_variables[i], val);
968 val = case_data (&c, cmd.v_dependent[k]->fv);
969 gsl_vector_set (Y, row, val->f);
974 Now that we know the number of coefficients, allocate space
975 and store pointers to the variables that correspond to the
978 pspp_linreg_coeff_init (lcache, X);
981 Find the least-squares estimates and other statistics.
983 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
984 subcommand_statistics (cmd.a_statistics, lcache);
985 subcommand_export (cmd.sbc_export, lcache);
987 design_matrix_destroy (X);
989 pspp_linreg_cache_free (lcache);
990 free (lopts.get_indep_mean_std);
991 casereader_destroy (r);
994 free (is_missing_case);