1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
30 #include "cat-routines.h"
32 #include "design-matrix.h"
33 #include "dictionary.h"
35 #include "file-handle-def.h"
39 #include "coefficient.h"
40 #include "missing-values.h"
41 #include "regression-export.h"
43 #include "value-labels.h"
45 #include "procedure.h"
47 #define REG_LARGE_DATA 1000
52 "REGRESSION" (regression_):
77 static struct cmd_regression cmd;
80 Array holding the subscripts of the independent variables.
85 File where the model will be saved if the EXPORT subcommand
88 struct file_handle *model_file;
91 Return value for the procedure.
93 int pspp_reg_rc = CMD_SUCCESS;
95 static bool run_regression (const struct casefile *, void *);
98 STATISTICS subcommand output functions.
100 static void reg_stats_r (pspp_linreg_cache *);
101 static void reg_stats_coeff (pspp_linreg_cache *);
102 static void reg_stats_anova (pspp_linreg_cache *);
103 static void reg_stats_outs (pspp_linreg_cache *);
104 static void reg_stats_zpp (pspp_linreg_cache *);
105 static void reg_stats_label (pspp_linreg_cache *);
106 static void reg_stats_sha (pspp_linreg_cache *);
107 static void reg_stats_ci (pspp_linreg_cache *);
108 static void reg_stats_f (pspp_linreg_cache *);
109 static void reg_stats_bcov (pspp_linreg_cache *);
110 static void reg_stats_ses (pspp_linreg_cache *);
111 static void reg_stats_xtx (pspp_linreg_cache *);
112 static void reg_stats_collin (pspp_linreg_cache *);
113 static void reg_stats_tol (pspp_linreg_cache *);
114 static void reg_stats_selection (pspp_linreg_cache *);
115 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
116 int, pspp_linreg_cache *);
119 reg_stats_r (pspp_linreg_cache * c)
129 rsq = c->ssm / c->sst;
130 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
131 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
132 t = tab_create (n_cols, n_rows, 0);
133 tab_dim (t, tab_natural_dimensions);
134 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
135 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
136 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
137 tab_vline (t, TAL_0, 1, 0, 0);
139 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
140 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
141 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
142 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
143 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
144 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
145 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
146 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
147 tab_title (t, 0, _("Model Summary"));
152 Table showing estimated regression coefficients.
155 reg_stats_coeff (pspp_linreg_cache * c)
168 const struct variable *v;
169 const union value *val;
174 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
175 n_rows = c->n_coeffs + 2;
177 t = tab_create (n_cols, n_rows, 0);
178 tab_headers (t, 2, 0, 1, 0);
179 tab_dim (t, tab_natural_dimensions);
180 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
181 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
182 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
183 tab_vline (t, TAL_0, 1, 0, 0);
185 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
186 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
187 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
188 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
189 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
190 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
191 coeff = c->coeff[0].estimate;
192 tab_float (t, 2, 1, 0, coeff, 10, 2);
193 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
194 tab_float (t, 3, 1, 0, std_err, 10, 2);
195 beta = coeff / c->depvar_std;
196 tab_float (t, 4, 1, 0, beta, 10, 2);
197 t_stat = coeff / std_err;
198 tab_float (t, 5, 1, 0, t_stat, 10, 2);
199 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
200 tab_float (t, 6, 1, 0, pval, 10, 2);
201 for (j = 1; j <= c->n_indeps; j++)
204 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
205 label = var_to_string (v);
206 /* Do not overwrite the variable's name. */
207 strncpy (tmp, label, MAX_STRING);
208 if (v->type == ALPHA)
211 Append the value associated with this coefficient.
212 This makes sense only if we us the usual binary encoding
216 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
217 val_s = value_to_string (val, v);
218 strncat (tmp, val_s, MAX_STRING);
221 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
223 Regression coefficients.
225 coeff = c->coeff[j].estimate;
226 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
228 Standard error of the coefficients.
230 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
231 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
233 'Standardized' coefficient, i.e., regression coefficient
234 if all variables had unit variance.
236 beta = gsl_vector_get (c->indep_std, j);
237 beta *= coeff / c->depvar_std;
238 tab_float (t, 4, j + 1, 0, beta, 10, 2);
241 Test statistic for H0: coefficient is 0.
243 t_stat = coeff / std_err;
244 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
246 P values for the test statistic above.
248 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
249 tab_float (t, 6, j + 1, 0, pval, 10, 2);
251 tab_title (t, 0, _("Coefficients"));
257 Display the ANOVA table.
260 reg_stats_anova (pspp_linreg_cache * c)
264 const double msm = c->ssm / c->dfm;
265 const double mse = c->sse / c->dfe;
266 const double F = msm / mse;
267 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
272 t = tab_create (n_cols, n_rows, 0);
273 tab_headers (t, 2, 0, 1, 0);
274 tab_dim (t, tab_natural_dimensions);
276 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
278 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
279 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
280 tab_vline (t, TAL_0, 1, 0, 0);
282 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
283 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
284 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
285 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
286 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
288 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
289 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
290 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
292 /* Sums of Squares */
293 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
294 tab_float (t, 2, 3, 0, c->sst, 10, 2);
295 tab_float (t, 2, 2, 0, c->sse, 10, 2);
298 /* Degrees of freedom */
299 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
300 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
301 tab_float (t, 3, 3, 0, c->dft, 4, 0);
305 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
306 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
308 tab_float (t, 5, 1, 0, F, 8, 3);
310 tab_float (t, 6, 1, 0, pval, 8, 3);
312 tab_title (t, 0, _("ANOVA"));
316 reg_stats_outs (pspp_linreg_cache * c)
321 reg_stats_zpp (pspp_linreg_cache * c)
326 reg_stats_label (pspp_linreg_cache * c)
331 reg_stats_sha (pspp_linreg_cache * c)
336 reg_stats_ci (pspp_linreg_cache * c)
341 reg_stats_f (pspp_linreg_cache * c)
346 reg_stats_bcov (pspp_linreg_cache * c)
359 n_cols = c->n_indeps + 1 + 2;
360 n_rows = 2 * (c->n_indeps + 1);
361 t = tab_create (n_cols, n_rows, 0);
362 tab_headers (t, 2, 0, 1, 0);
363 tab_dim (t, tab_natural_dimensions);
364 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
365 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
366 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
367 tab_vline (t, TAL_0, 1, 0, 0);
368 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
369 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
370 for (i = 1; i < c->n_indeps + 1; i++)
372 j = indep_vars[(i - 1)];
373 struct variable *v = cmd.v_variables[j];
374 label = var_to_string (v);
375 tab_text (t, 2, i, TAB_CENTER, label);
376 tab_text (t, i + 2, 0, TAB_CENTER, label);
377 for (k = 1; k < c->n_indeps + 1; k++)
379 col = (i <= k) ? k : i;
380 row = (i <= k) ? i : k;
381 tab_float (t, k + 2, i, TAB_CENTER,
382 gsl_matrix_get (c->cov, row, col), 8, 3);
385 tab_title (t, 0, _("Coefficient Correlations"));
389 reg_stats_ses (pspp_linreg_cache * c)
394 reg_stats_xtx (pspp_linreg_cache * c)
399 reg_stats_collin (pspp_linreg_cache * c)
404 reg_stats_tol (pspp_linreg_cache * c)
409 reg_stats_selection (pspp_linreg_cache * c)
415 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
416 int keyword, pspp_linreg_cache * c)
425 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
428 The order here must match the order in which the STATISTICS
429 keywords appear in the specification section above.
456 Set everything but F.
458 for (i = 0; i < f; i++)
465 for (i = 0; i < all; i++)
473 Default output: ANOVA table, parameter estimates,
474 and statistics for variables not entered into model,
477 if (keywords[defaults] | d)
485 statistics_keyword_output (reg_stats_r, keywords[r], c);
486 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
487 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
488 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
489 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
490 statistics_keyword_output (reg_stats_label, keywords[label], c);
491 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
492 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
493 statistics_keyword_output (reg_stats_f, keywords[f], c);
494 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
495 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
496 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
497 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
498 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
499 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
502 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
506 for (i = 0; i < n_vars; i++)
508 if (v->index == varlist[i]->index)
516 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
521 struct variable **varlist;
522 struct pspp_linreg_coeff *coeff;
523 const struct variable *v;
526 fprintf (fp, "%s", reg_export_categorical_encode_1);
528 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
529 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
531 coeff = c->coeff + i;
532 v = pspp_linreg_coeff_get_var (coeff, 0);
533 if (v->type == ALPHA)
535 if (!reg_inserted (v, varlist, n_vars))
537 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
539 varlist[n_vars] = (struct variable *) v;
544 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
545 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
547 for (i = 0; i < n_vars - 1; i++)
549 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
551 fprintf (fp, "&%s};\n\t", varlist[i]->name);
553 for (i = 0; i < n_vars; i++)
555 coeff = c->coeff + i;
556 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
558 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
559 varlist[i]->obs_vals->n_categories);
561 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
563 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
564 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
565 value_to_string (val, varlist[i]));
568 fprintf (fp, "%s", reg_export_categorical_encode_2);
572 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
575 struct pspp_linreg_coeff *coeff;
576 const struct variable *v;
578 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
579 for (i = 1; i < c->n_indeps; i++)
581 coeff = c->coeff + i;
582 v = pspp_linreg_coeff_get_var (coeff, 0);
583 fprintf (fp, "\"%s\",\n\t\t", v->name);
585 coeff = c->coeff + i;
586 v = pspp_linreg_coeff_get_var (coeff, 0);
587 fprintf (fp, "\"%s\"};\n\t", v->name);
590 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
592 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
593 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
594 reg_print_depvars (fp, c);
595 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
597 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
598 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
601 subcommand_export (int export, pspp_linreg_cache * c)
605 int n_quantiles = 100;
608 struct pspp_linreg_coeff coeff;
614 assert (model_file != NULL);
616 fp = fopen (fh_get_filename (model_file), "w");
617 fprintf (fp, "%s", reg_preamble);
618 reg_print_getvar (fp, c);
619 reg_print_categorical_encoding (fp, c);
620 fprintf (fp, "%s", reg_export_t_quantiles_1);
621 increment = 0.5 / (double) increment;
622 for (i = 0; i < n_quantiles - 1; i++)
624 tmp = 0.5 + 0.005 * (double) i;
625 fprintf (fp, "%.15e,\n\t\t",
626 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
628 fprintf (fp, "%.15e};\n\t",
629 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
630 fprintf (fp, "%s", reg_export_t_quantiles_2);
631 fprintf (fp, "%s", reg_mean_cmt);
632 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
633 fprintf (fp, "const char *var_names[])\n{\n\t");
634 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
635 for (i = 1; i < c->n_indeps; i++)
638 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
641 fprintf (fp, "%.15e};\n\t", coeff.estimate);
643 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
644 fprintf (fp, "int i;\n\tint j;\n\n\t");
645 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
646 fprintf (fp, "%s", reg_getvar);
647 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
649 for (i = 0; i < c->cov->size1 - 1; i++)
652 for (j = 0; j < c->cov->size2 - 1; j++)
654 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
656 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
659 for (j = 0; j < c->cov->size2 - 1; j++)
661 fprintf (fp, "%.15e, ",
662 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
664 fprintf (fp, "%.15e}\n\t",
665 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
666 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
668 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
669 fprintf (fp, "%s", reg_variance);
670 fprintf (fp, "%s", reg_export_confidence_interval);
671 tmp = c->mse * c->mse;
672 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
673 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
674 fprintf (fp, "%s", reg_export_prediction_interval_3);
676 fp = fopen ("pspp_model_reg.h", "w");
677 fprintf (fp, "%s", reg_header);
682 regression_custom_export (struct cmd_regression *cmd)
684 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
685 if (!lex_force_match ('('))
692 model_file = fh_parse (FH_REF_FILE);
693 if (model_file == NULL)
697 if (!lex_force_match (')'))
704 cmd_regression (void)
706 if (!parse_regression (&cmd))
708 if (!multipass_procedure_with_splits (run_regression, &cmd))
709 return CMD_CASCADING_FAILURE;
715 Is variable k one of the dependent variables?
721 for (j = 0; j < cmd.n_dependent; j++)
724 compare_var_names returns 0 if the variable
727 if (!compare_var_names (cmd.v_dependent[j], cmd.v_variables[k], NULL))
734 Mark missing cases. Return the number of non-missing cases.
737 mark_missing_cases (const struct casefile *cf, struct variable *v,
738 int *is_missing_case, double n_data)
740 struct casereader *r;
743 const union value *val;
745 for (r = casefile_get_reader (cf);
746 casereader_read (r, &c); case_destroy (&c))
748 row = casereader_cnum (r) - 1;
750 val = case_data (&c, v->fv);
751 cat_value_update (v, val);
752 if (mv_is_value_missing (&v->miss, val))
754 if (!is_missing_case[row])
756 /* Now it is missing. */
758 is_missing_case[row] = 1;
762 casereader_destroy (r);
768 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
778 Keep track of the missing cases.
780 int *is_missing_case;
781 const union value *val;
782 struct casereader *r;
785 struct variable *depvar;
786 struct variable **indep_vars;
787 struct design_matrix *X;
789 pspp_linreg_cache *lcache;
790 pspp_linreg_opts lopts;
792 n_data = casefile_get_case_cnt (cf);
794 for (i = 0; i < cmd.n_dependent; i++)
796 if (cmd.v_dependent[i]->type != NUMERIC)
798 msg (SE, gettext ("Dependent variable must be numeric."));
799 pspp_reg_rc = CMD_FAILURE;
804 is_missing_case = xnmalloc (n_data, sizeof (*is_missing_case));
805 for (i = 0; i < n_data; i++)
806 is_missing_case[i] = 0;
808 n_indep = cmd.n_variables - cmd.n_dependent;
809 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
811 lopts.get_depvar_mean_std = 1;
812 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
815 Read from the active file. The first pass encodes categorical
816 variables and drops cases with missing values.
819 for (i = 0; i < cmd.n_variables; i++)
823 v = cmd.v_variables[i];
826 if (v->type == ALPHA)
828 /* Make a place to hold the binary vectors
829 corresponding to this variable's values. */
830 cat_stored_values_create (v);
832 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
837 Drop cases with missing values for any dependent variable.
840 for (i = 0; i < cmd.n_dependent; i++)
842 v = cmd.v_dependent[i];
844 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
847 for (k = 0; k < cmd.n_dependent; k++)
849 depvar = cmd.v_dependent[k];
850 Y = gsl_vector_alloc (n_data);
853 design_matrix_create (n_indep, (const struct variable **) indep_vars,
855 for (i = 0; i < X->m->size2; i++)
857 lopts.get_indep_mean_std[i] = 1;
859 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
860 lcache->indep_means = gsl_vector_alloc (X->m->size2);
861 lcache->indep_std = gsl_vector_alloc (X->m->size2);
862 lcache->depvar = (const struct variable *) depvar;
864 For large data sets, use QR decomposition.
866 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
868 lcache->method = PSPP_LINREG_SVD;
872 The second pass creates the design matrix.
875 for (r = casefile_get_reader (cf); casereader_read (r, &c);
877 /* Iterate over the cases. */
879 case_num = casereader_cnum (r) - 1;
880 if (!is_missing_case[case_num])
882 for (i = 0; i < cmd.n_variables; ++i) /* Iterate over the variables
883 for the current case.
886 v = cmd.v_variables[i];
887 val = case_data (&c, v->fv);
889 Independent/dependent variable separation. The
890 'variables' subcommand specifies a varlist which contains
891 both dependent and independent variables. The dependent
892 variables are specified with the 'dependent'
893 subcommand, and maybe also in the 'variables' subcommand.
894 We need to separate the two.
898 if (v->type == ALPHA)
900 design_matrix_set_categorical (X, row, v, val);
902 else if (v->type == NUMERIC)
904 design_matrix_set_numeric (X, row, v, val);
908 val = case_data (&c, depvar->fv);
909 gsl_vector_set (Y, row, val->f);
914 Now that we know the number of coefficients, allocate space
915 and store pointers to the variables that correspond to the
918 pspp_linreg_coeff_init (lcache, X);
921 Find the least-squares estimates and other statistics.
923 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
924 subcommand_statistics (cmd.a_statistics, lcache);
925 subcommand_export (cmd.sbc_export, lcache);
927 design_matrix_destroy (X);
928 pspp_linreg_cache_free (lcache);
929 free (lopts.get_indep_mean_std);
930 casereader_destroy (r);
933 free (is_missing_case);