1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
30 #include "cat-routines.h"
33 #include "design-matrix.h"
34 #include "dictionary.h"
36 #include "file-handle-def.h"
40 #include "coefficient.h"
41 #include "missing-values.h"
42 #include "regression-export.h"
44 #include "value-labels.h"
46 #include "procedure.h"
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
78 static struct cmd_regression cmd;
81 Array holding the subscripts of the independent variables.
86 File where the model will be saved if the EXPORT subcommand
89 struct file_handle *model_file;
92 Return value for the procedure.
94 int pspp_reg_rc = CMD_SUCCESS;
96 static bool run_regression (const struct casefile *, void *);
99 STATISTICS subcommand output functions.
101 static void reg_stats_r (pspp_linreg_cache *);
102 static void reg_stats_coeff (pspp_linreg_cache *);
103 static void reg_stats_anova (pspp_linreg_cache *);
104 static void reg_stats_outs (pspp_linreg_cache *);
105 static void reg_stats_zpp (pspp_linreg_cache *);
106 static void reg_stats_label (pspp_linreg_cache *);
107 static void reg_stats_sha (pspp_linreg_cache *);
108 static void reg_stats_ci (pspp_linreg_cache *);
109 static void reg_stats_f (pspp_linreg_cache *);
110 static void reg_stats_bcov (pspp_linreg_cache *);
111 static void reg_stats_ses (pspp_linreg_cache *);
112 static void reg_stats_xtx (pspp_linreg_cache *);
113 static void reg_stats_collin (pspp_linreg_cache *);
114 static void reg_stats_tol (pspp_linreg_cache *);
115 static void reg_stats_selection (pspp_linreg_cache *);
116 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
117 int, pspp_linreg_cache *);
120 reg_stats_r (pspp_linreg_cache * c)
130 rsq = c->ssm / c->sst;
131 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
132 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
133 t = tab_create (n_cols, n_rows, 0);
134 tab_dim (t, tab_natural_dimensions);
135 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
136 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
137 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
138 tab_vline (t, TAL_0, 1, 0, 0);
140 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
141 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
142 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
143 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
144 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
145 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
146 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
147 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
148 tab_title (t, 0, _("Model Summary"));
153 Table showing estimated regression coefficients.
156 reg_stats_coeff (pspp_linreg_cache * c)
169 const struct variable *v;
170 const union value *val;
175 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
176 n_rows = c->n_coeffs + 2;
178 t = tab_create (n_cols, n_rows, 0);
179 tab_headers (t, 2, 0, 1, 0);
180 tab_dim (t, tab_natural_dimensions);
181 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
182 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
183 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
184 tab_vline (t, TAL_0, 1, 0, 0);
186 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
187 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
188 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
189 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
190 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
191 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
192 coeff = c->coeff[0].estimate;
193 tab_float (t, 2, 1, 0, coeff, 10, 2);
194 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
195 tab_float (t, 3, 1, 0, std_err, 10, 2);
196 beta = coeff / c->depvar_std;
197 tab_float (t, 4, 1, 0, beta, 10, 2);
198 t_stat = coeff / std_err;
199 tab_float (t, 5, 1, 0, t_stat, 10, 2);
200 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
201 tab_float (t, 6, 1, 0, pval, 10, 2);
202 for (j = 1; j <= c->n_indeps; j++)
205 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
206 label = var_to_string (v);
207 /* Do not overwrite the variable's name. */
208 strncpy (tmp, label, MAX_STRING);
209 if (v->type == ALPHA)
212 Append the value associated with this coefficient.
213 This makes sense only if we us the usual binary encoding
217 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
218 val_s = value_to_string (val, v);
219 strncat (tmp, val_s, MAX_STRING);
222 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
224 Regression coefficients.
226 coeff = c->coeff[j].estimate;
227 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
229 Standard error of the coefficients.
231 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
232 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
234 'Standardized' coefficient, i.e., regression coefficient
235 if all variables had unit variance.
237 beta = gsl_vector_get (c->indep_std, j);
238 beta *= coeff / c->depvar_std;
239 tab_float (t, 4, j + 1, 0, beta, 10, 2);
242 Test statistic for H0: coefficient is 0.
244 t_stat = coeff / std_err;
245 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
247 P values for the test statistic above.
249 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
250 tab_float (t, 6, j + 1, 0, pval, 10, 2);
252 tab_title (t, 0, _("Coefficients"));
258 Display the ANOVA table.
261 reg_stats_anova (pspp_linreg_cache * c)
265 const double msm = c->ssm / c->dfm;
266 const double mse = c->sse / c->dfe;
267 const double F = msm / mse;
268 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
273 t = tab_create (n_cols, n_rows, 0);
274 tab_headers (t, 2, 0, 1, 0);
275 tab_dim (t, tab_natural_dimensions);
277 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
279 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
280 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
281 tab_vline (t, TAL_0, 1, 0, 0);
283 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
284 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
285 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
286 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
287 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
289 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
290 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
291 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
293 /* Sums of Squares */
294 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
295 tab_float (t, 2, 3, 0, c->sst, 10, 2);
296 tab_float (t, 2, 2, 0, c->sse, 10, 2);
299 /* Degrees of freedom */
300 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
301 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
302 tab_float (t, 3, 3, 0, c->dft, 4, 0);
306 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
307 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
309 tab_float (t, 5, 1, 0, F, 8, 3);
311 tab_float (t, 6, 1, 0, pval, 8, 3);
313 tab_title (t, 0, _("ANOVA"));
317 reg_stats_outs (pspp_linreg_cache * c)
322 reg_stats_zpp (pspp_linreg_cache * c)
327 reg_stats_label (pspp_linreg_cache * c)
332 reg_stats_sha (pspp_linreg_cache * c)
337 reg_stats_ci (pspp_linreg_cache * c)
342 reg_stats_f (pspp_linreg_cache * c)
347 reg_stats_bcov (pspp_linreg_cache * c)
360 n_cols = c->n_indeps + 1 + 2;
361 n_rows = 2 * (c->n_indeps + 1);
362 t = tab_create (n_cols, n_rows, 0);
363 tab_headers (t, 2, 0, 1, 0);
364 tab_dim (t, tab_natural_dimensions);
365 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
366 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
367 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
368 tab_vline (t, TAL_0, 1, 0, 0);
369 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
370 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
371 for (i = 1; i < c->n_indeps + 1; i++)
373 j = indep_vars[(i - 1)];
374 struct variable *v = cmd.v_variables[j];
375 label = var_to_string (v);
376 tab_text (t, 2, i, TAB_CENTER, label);
377 tab_text (t, i + 2, 0, TAB_CENTER, label);
378 for (k = 1; k < c->n_indeps + 1; k++)
380 col = (i <= k) ? k : i;
381 row = (i <= k) ? i : k;
382 tab_float (t, k + 2, i, TAB_CENTER,
383 gsl_matrix_get (c->cov, row, col), 8, 3);
386 tab_title (t, 0, _("Coefficient Correlations"));
390 reg_stats_ses (pspp_linreg_cache * c)
395 reg_stats_xtx (pspp_linreg_cache * c)
400 reg_stats_collin (pspp_linreg_cache * c)
405 reg_stats_tol (pspp_linreg_cache * c)
410 reg_stats_selection (pspp_linreg_cache * c)
416 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
417 int keyword, pspp_linreg_cache * c)
426 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
429 The order here must match the order in which the STATISTICS
430 keywords appear in the specification section above.
457 Set everything but F.
459 for (i = 0; i < f; i++)
466 for (i = 0; i < all; i++)
474 Default output: ANOVA table, parameter estimates,
475 and statistics for variables not entered into model,
478 if (keywords[defaults] | d)
486 statistics_keyword_output (reg_stats_r, keywords[r], c);
487 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
488 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
489 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
490 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
491 statistics_keyword_output (reg_stats_label, keywords[label], c);
492 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
493 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
494 statistics_keyword_output (reg_stats_f, keywords[f], c);
495 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
496 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
497 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
498 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
499 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
500 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
503 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
507 for (i = 0; i < n_vars; i++)
509 if (v->index == varlist[i]->index)
517 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
522 struct variable **varlist;
523 struct pspp_linreg_coeff *coeff;
524 const struct variable *v;
527 fprintf (fp, "%s", reg_export_categorical_encode_1);
529 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
530 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
532 coeff = c->coeff + i;
533 v = pspp_linreg_coeff_get_var (coeff, 0);
534 if (v->type == ALPHA)
536 if (!reg_inserted (v, varlist, n_vars))
538 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
540 varlist[n_vars] = (struct variable *) v;
545 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
546 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
548 for (i = 0; i < n_vars - 1; i++)
550 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
552 fprintf (fp, "&%s};\n\t", varlist[i]->name);
554 for (i = 0; i < n_vars; i++)
556 coeff = c->coeff + i;
557 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
559 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
560 varlist[i]->obs_vals->n_categories);
562 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
564 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
565 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
566 value_to_string (val, varlist[i]));
569 fprintf (fp, "%s", reg_export_categorical_encode_2);
573 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
576 struct pspp_linreg_coeff *coeff;
577 const struct variable *v;
579 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
580 for (i = 1; i < c->n_indeps; i++)
582 coeff = c->coeff + i;
583 v = pspp_linreg_coeff_get_var (coeff, 0);
584 fprintf (fp, "\"%s\",\n\t\t", v->name);
586 coeff = c->coeff + i;
587 v = pspp_linreg_coeff_get_var (coeff, 0);
588 fprintf (fp, "\"%s\"};\n\t", v->name);
591 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
593 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
594 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
595 reg_print_depvars (fp, c);
596 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
598 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
599 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
602 subcommand_export (int export, pspp_linreg_cache * c)
606 int n_quantiles = 100;
609 struct pspp_linreg_coeff coeff;
615 assert (model_file != NULL);
617 fp = fopen (fh_get_filename (model_file), "w");
618 fprintf (fp, "%s", reg_preamble);
619 reg_print_getvar (fp, c);
620 reg_print_categorical_encoding (fp, c);
621 fprintf (fp, "%s", reg_export_t_quantiles_1);
622 increment = 0.5 / (double) increment;
623 for (i = 0; i < n_quantiles - 1; i++)
625 tmp = 0.5 + 0.005 * (double) i;
626 fprintf (fp, "%.15e,\n\t\t",
627 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
629 fprintf (fp, "%.15e};\n\t",
630 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
631 fprintf (fp, "%s", reg_export_t_quantiles_2);
632 fprintf (fp, "%s", reg_mean_cmt);
633 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
634 fprintf (fp, "const char *var_names[])\n{\n\t");
635 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
636 for (i = 1; i < c->n_indeps; i++)
639 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
642 fprintf (fp, "%.15e};\n\t", coeff.estimate);
644 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
645 fprintf (fp, "int i;\n\tint j;\n\n\t");
646 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
647 fprintf (fp, "%s", reg_getvar);
648 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
650 for (i = 0; i < c->cov->size1 - 1; i++)
653 for (j = 0; j < c->cov->size2 - 1; j++)
655 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
657 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
660 for (j = 0; j < c->cov->size2 - 1; j++)
662 fprintf (fp, "%.15e, ",
663 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
665 fprintf (fp, "%.15e}\n\t",
666 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
667 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
669 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
670 fprintf (fp, "%s", reg_variance);
671 fprintf (fp, "%s", reg_export_confidence_interval);
672 tmp = c->mse * c->mse;
673 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
674 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
675 fprintf (fp, "%s", reg_export_prediction_interval_3);
677 fp = fopen ("pspp_model_reg.h", "w");
678 fprintf (fp, "%s", reg_header);
683 regression_custom_export (struct cmd_regression *cmd)
685 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
686 if (!lex_force_match ('('))
693 model_file = fh_parse (FH_REF_FILE);
694 if (model_file == NULL)
698 if (!lex_force_match (')'))
705 cmd_regression (void)
707 if (!parse_regression (&cmd))
709 if (!multipass_procedure_with_splits (run_regression, &cmd))
710 return CMD_CASCADING_FAILURE;
716 Is variable k one of the dependent variables?
722 for (j = 0; j < cmd.n_dependent; j++)
725 compare_var_names returns 0 if the variable
728 if (!compare_var_names (cmd.v_dependent[j], cmd.v_variables[k], NULL))
735 Mark missing cases. Return the number of non-missing cases.
738 mark_missing_cases (const struct casefile *cf, struct variable *v,
739 int *is_missing_case, double n_data)
741 struct casereader *r;
744 const union value *val;
746 for (r = casefile_get_reader (cf);
747 casereader_read (r, &c); case_destroy (&c))
749 row = casereader_cnum (r) - 1;
751 val = case_data (&c, v->fv);
752 cat_value_update (v, val);
753 if (mv_is_value_missing (&v->miss, val))
755 if (!is_missing_case[row])
757 /* Now it is missing. */
759 is_missing_case[row] = 1;
763 casereader_destroy (r);
769 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
779 Keep track of the missing cases.
781 int *is_missing_case;
782 const union value *val;
783 struct casereader *r;
786 struct variable *depvar;
787 struct variable **indep_vars;
788 struct design_matrix *X;
790 pspp_linreg_cache *lcache;
791 pspp_linreg_opts lopts;
793 n_data = casefile_get_case_cnt (cf);
795 for (i = 0; i < cmd.n_dependent; i++)
797 if (cmd.v_dependent[i]->type != NUMERIC)
799 msg (SE, gettext ("Dependent variable must be numeric."));
800 pspp_reg_rc = CMD_FAILURE;
805 is_missing_case = xnmalloc (n_data, sizeof (*is_missing_case));
806 for (i = 0; i < n_data; i++)
807 is_missing_case[i] = 0;
809 n_indep = cmd.n_variables - cmd.n_dependent;
810 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
812 lopts.get_depvar_mean_std = 1;
813 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
816 Read from the active file. The first pass encodes categorical
817 variables and drops cases with missing values.
820 for (i = 0; i < cmd.n_variables; i++)
824 v = cmd.v_variables[i];
827 if (v->type == ALPHA)
829 /* Make a place to hold the binary vectors
830 corresponding to this variable's values. */
831 cat_stored_values_create (v);
833 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
838 Drop cases with missing values for any dependent variable.
841 for (i = 0; i < cmd.n_dependent; i++)
843 v = cmd.v_dependent[i];
845 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
848 for (k = 0; k < cmd.n_dependent; k++)
850 depvar = cmd.v_dependent[k];
851 Y = gsl_vector_alloc (n_data);
854 design_matrix_create (n_indep, (const struct variable **) indep_vars,
856 for (i = 0; i < X->m->size2; i++)
858 lopts.get_indep_mean_std[i] = 1;
860 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
861 lcache->indep_means = gsl_vector_alloc (X->m->size2);
862 lcache->indep_std = gsl_vector_alloc (X->m->size2);
863 lcache->depvar = (const struct variable *) depvar;
865 For large data sets, use QR decomposition.
867 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
869 lcache->method = PSPP_LINREG_SVD;
873 The second pass creates the design matrix.
876 for (r = casefile_get_reader (cf); casereader_read (r, &c);
878 /* Iterate over the cases. */
880 case_num = casereader_cnum (r) - 1;
881 if (!is_missing_case[case_num])
883 for (i = 0; i < cmd.n_variables; ++i) /* Iterate over the variables
884 for the current case.
887 v = cmd.v_variables[i];
888 val = case_data (&c, v->fv);
890 Independent/dependent variable separation. The
891 'variables' subcommand specifies a varlist which contains
892 both dependent and independent variables. The dependent
893 variables are specified with the 'dependent'
894 subcommand, and maybe also in the 'variables' subcommand.
895 We need to separate the two.
899 if (v->type == ALPHA)
901 design_matrix_set_categorical (X, row, v, val);
903 else if (v->type == NUMERIC)
905 design_matrix_set_numeric (X, row, v, val);
909 val = case_data (&c, depvar->fv);
910 gsl_vector_set (Y, row, val->f);
915 Now that we know the number of coefficients, allocate space
916 and store pointers to the variables that correspond to the
919 pspp_linreg_coeff_init (lcache, X);
922 Find the least-squares estimates and other statistics.
924 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
925 subcommand_statistics (cmd.a_statistics, lcache);
926 subcommand_export (cmd.sbc_export, lcache);
928 design_matrix_destroy (X);
929 pspp_linreg_cache_free (lcache);
930 free (lopts.get_indep_mean_std);
931 casereader_destroy (r);
934 free (is_missing_case);