1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
29 #include "cat-routines.h"
31 #include "design-matrix.h"
32 #include "dictionary.h"
34 #include "file-handle.h"
37 #include <linreg/pspp_linreg.h>
38 #include "missing-values.h"
39 #include "regression_export.h"
41 #include "value-labels.h"
45 #define REG_LARGE_DATA 1000
50 "REGRESSION" (regression_):
75 static struct cmd_regression cmd;
78 Array holding the subscripts of the independent variables.
83 File where the model will be saved if the EXPORT subcommand
86 struct file_handle *model_file;
89 Return value for the procedure.
91 int pspp_reg_rc = CMD_SUCCESS;
93 static void run_regression (const struct casefile *, void *);
94 static union value *pspp_reg_coefficient_to_value (const pspp_linreg_cache *,
100 Which value of a categorical variable corresponds to the
101 coefficient? This routine provides the answer IF the
102 categorical variable is encoded in the usual way: The
103 first value maps to the zero vector, the second value to
104 the vector (0, 1, 0, ...), the third value (0, 0, 1, 0, ...),
108 pspp_reg_coefficient_to_value (const pspp_linreg_cache * cache,
109 const struct pspp_linreg_coeff *coeff,
113 int k = 1; /* First coefficient is the intercept. */
115 const union value *result;
116 struct pspp_linreg_coeff c;
118 assert (cache != NULL);
119 assert (coeff != NULL);
120 assert (coeff->v->type == ALPHA);
121 while (!found && k < cache->n_coeffs)
125 compare_var_names returns 0 if the variable
128 if (!compare_var_names (coeff->v, c.v, NULL))
141 result = cat_subscript_to_value ((const size_t) which, coeff->v);
146 STATISTICS subcommand output functions.
148 static void reg_stats_r (pspp_linreg_cache *);
149 static void reg_stats_coeff (pspp_linreg_cache *);
150 static void reg_stats_anova (pspp_linreg_cache *);
151 static void reg_stats_outs (pspp_linreg_cache *);
152 static void reg_stats_zpp (pspp_linreg_cache *);
153 static void reg_stats_label (pspp_linreg_cache *);
154 static void reg_stats_sha (pspp_linreg_cache *);
155 static void reg_stats_ci (pspp_linreg_cache *);
156 static void reg_stats_f (pspp_linreg_cache *);
157 static void reg_stats_bcov (pspp_linreg_cache *);
158 static void reg_stats_ses (pspp_linreg_cache *);
159 static void reg_stats_xtx (pspp_linreg_cache *);
160 static void reg_stats_collin (pspp_linreg_cache *);
161 static void reg_stats_tol (pspp_linreg_cache *);
162 static void reg_stats_selection (pspp_linreg_cache *);
163 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
164 int, pspp_linreg_cache *);
167 reg_stats_r (pspp_linreg_cache * c)
177 rsq = c->ssm / c->sst;
178 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
179 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
180 t = tab_create (n_cols, n_rows, 0);
181 tab_dim (t, tab_natural_dimensions);
182 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
183 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
184 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
185 tab_vline (t, TAL_0, 1, 0, 0);
187 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
188 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
189 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
190 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
191 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
192 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
193 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
194 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
195 tab_title (t, 0, _("Model Summary"));
200 Table showing estimated regression coefficients.
203 reg_stats_coeff (pspp_linreg_cache * c)
216 const union value *val;
221 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
222 n_rows = c->n_coeffs + 2;
224 t = tab_create (n_cols, n_rows, 0);
225 tab_headers (t, 2, 0, 1, 0);
226 tab_dim (t, tab_natural_dimensions);
227 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
228 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
229 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
230 tab_vline (t, TAL_0, 1, 0, 0);
232 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
233 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
234 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
235 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
236 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
237 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
238 coeff = c->coeff[0].estimate;
239 tab_float (t, 2, 1, 0, coeff, 10, 2);
240 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
241 tab_float (t, 3, 1, 0, std_err, 10, 2);
242 beta = coeff / c->depvar_std;
243 tab_float (t, 4, 1, 0, beta, 10, 2);
244 t_stat = coeff / std_err;
245 tab_float (t, 5, 1, 0, t_stat, 10, 2);
246 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
247 tab_float (t, 6, 1, 0, pval, 10, 2);
248 for (j = 1; j <= c->n_indeps; j++)
251 label = var_to_string (c->coeff[j].v);
252 /* Do not overwrite the variable's name. */
253 strncpy (tmp, label, MAX_STRING);
254 if (c->coeff[j].v->type == ALPHA)
257 Append the value associated with this coefficient.
258 This makes sense only if we us the usual binary encoding
261 val = pspp_reg_coefficient_to_value (c, &(c->coeff[j]), j);
262 val_s = value_to_string (val, c->coeff[j].v);
263 strncat (tmp, val_s, MAX_STRING);
266 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
268 Regression coefficients.
270 coeff = c->coeff[j].estimate;
271 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
273 Standard error of the coefficients.
275 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
276 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
278 'Standardized' coefficient, i.e., regression coefficient
279 if all variables had unit variance.
281 beta = gsl_vector_get (c->indep_std, j);
282 beta *= coeff / c->depvar_std;
283 tab_float (t, 4, j + 1, 0, beta, 10, 2);
286 Test statistic for H0: coefficient is 0.
288 t_stat = coeff / std_err;
289 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
291 P values for the test statistic above.
293 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
294 tab_float (t, 6, j + 1, 0, pval, 10, 2);
296 tab_title (t, 0, _("Coefficients"));
302 Display the ANOVA table.
305 reg_stats_anova (pspp_linreg_cache * c)
309 const double msm = c->ssm / c->dfm;
310 const double mse = c->sse / c->dfe;
311 const double F = msm / mse;
312 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
317 t = tab_create (n_cols, n_rows, 0);
318 tab_headers (t, 2, 0, 1, 0);
319 tab_dim (t, tab_natural_dimensions);
321 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
323 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
324 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
325 tab_vline (t, TAL_0, 1, 0, 0);
327 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
328 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
329 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
330 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
331 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
333 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
334 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
335 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
337 /* Sums of Squares */
338 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
339 tab_float (t, 2, 3, 0, c->sst, 10, 2);
340 tab_float (t, 2, 2, 0, c->sse, 10, 2);
343 /* Degrees of freedom */
344 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
345 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
346 tab_float (t, 3, 3, 0, c->dft, 4, 0);
350 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
351 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
353 tab_float (t, 5, 1, 0, F, 8, 3);
355 tab_float (t, 6, 1, 0, pval, 8, 3);
357 tab_title (t, 0, _("ANOVA"));
361 reg_stats_outs (pspp_linreg_cache * c)
366 reg_stats_zpp (pspp_linreg_cache * c)
371 reg_stats_label (pspp_linreg_cache * c)
376 reg_stats_sha (pspp_linreg_cache * c)
381 reg_stats_ci (pspp_linreg_cache * c)
386 reg_stats_f (pspp_linreg_cache * c)
391 reg_stats_bcov (pspp_linreg_cache * c)
404 n_cols = c->n_indeps + 1 + 2;
405 n_rows = 2 * (c->n_indeps + 1);
406 t = tab_create (n_cols, n_rows, 0);
407 tab_headers (t, 2, 0, 1, 0);
408 tab_dim (t, tab_natural_dimensions);
409 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
410 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
411 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
412 tab_vline (t, TAL_0, 1, 0, 0);
413 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
414 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
415 for (i = 1; i < c->n_indeps + 1; i++)
417 j = indep_vars[(i - 1)];
418 struct variable *v = cmd.v_variables[j];
419 label = var_to_string (v);
420 tab_text (t, 2, i, TAB_CENTER, label);
421 tab_text (t, i + 2, 0, TAB_CENTER, label);
422 for (k = 1; k < c->n_indeps + 1; k++)
424 col = (i <= k) ? k : i;
425 row = (i <= k) ? i : k;
426 tab_float (t, k + 2, i, TAB_CENTER,
427 gsl_matrix_get (c->cov, row, col), 8, 3);
430 tab_title (t, 0, _("Coefficient Correlations"));
434 reg_stats_ses (pspp_linreg_cache * c)
439 reg_stats_xtx (pspp_linreg_cache * c)
444 reg_stats_collin (pspp_linreg_cache * c)
449 reg_stats_tol (pspp_linreg_cache * c)
454 reg_stats_selection (pspp_linreg_cache * c)
460 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
461 int keyword, pspp_linreg_cache * c)
470 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
473 The order here must match the order in which the STATISTICS
474 keywords appear in the specification section above.
501 Set everything but F.
503 for (i = 0; i < f; i++)
510 for (i = 0; i < all; i++)
518 Default output: ANOVA table, parameter estimates,
519 and statistics for variables not entered into model,
522 if (keywords[defaults] | d)
530 statistics_keyword_output (reg_stats_r, keywords[r], c);
531 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
532 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
533 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
534 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
535 statistics_keyword_output (reg_stats_label, keywords[label], c);
536 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
537 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
538 statistics_keyword_output (reg_stats_f, keywords[f], c);
539 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
540 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
541 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
542 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
543 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
544 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
547 reg_inserted (struct variable *v, struct variable **varlist, int n_vars)
551 for (i = 0; i < n_vars; i++)
553 if (v->index == varlist[i]->index)
561 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
566 struct variable **varlist;
567 struct pspp_linreg_coeff coeff;
570 fprintf (fp, "%s", reg_export_categorical_encode_1);
572 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
573 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
576 if (coeff.v->type == ALPHA)
578 if (!reg_inserted (coeff.v, varlist, n_vars))
580 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
582 varlist[n_vars] = coeff.v;
587 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
588 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
590 for (i = 0; i < n_vars - 1; i++)
592 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
594 fprintf (fp, "&%s};\n\t", varlist[i]->name);
596 for (i = 0; i < n_vars; i++)
599 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
601 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
602 varlist[i]->obs_vals->n_categories);
604 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
606 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
607 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
608 value_to_string (val, varlist[i]));
611 fprintf (fp, "%s", reg_export_categorical_encode_2);
615 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
618 struct pspp_linreg_coeff coeff;
620 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
621 for (i = 1; i < c->n_indeps; i++)
624 fprintf (fp, "\"%s\",\n\t\t", coeff.v->name);
627 fprintf (fp, "\"%s\"};\n\t", coeff.v->name);
630 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
632 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
633 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
634 reg_print_depvars (fp, c);
635 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
637 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
638 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
641 subcommand_export (int export, pspp_linreg_cache * c)
646 int n_quantiles = 100;
649 struct pspp_linreg_coeff coeff;
654 assert (model_file != NULL);
656 fp = fopen (handle_get_filename (model_file), "w");
657 fprintf (fp, "%s", reg_preamble);
658 reg_print_getvar (fp, c);
659 reg_print_categorical_encoding (fp, c);
660 fprintf (fp, "%s", reg_export_t_quantiles_1);
661 increment = 0.5 / (double) increment;
662 for (i = 0; i < n_quantiles - 1; i++)
664 tmp = 0.5 + 0.005 * (double) i;
665 fprintf (fp, "%.15e,\n\t\t",
666 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
668 fprintf (fp, "%.15e};\n\t",
669 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
670 fprintf (fp, "%s", reg_export_t_quantiles_2);
671 fprintf (fp, "%s", reg_mean_cmt);
672 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
673 fprintf (fp, "const char *var_names[])\n{\n\t");
674 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
675 for (i = 1; i < c->n_indeps; i++)
678 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
681 fprintf (fp, "%.15e};\n\t", coeff.estimate);
683 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
684 fprintf (fp, "int i;\n\tint j;\n\n\t");
685 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
686 fprintf (fp, "%s", reg_getvar);
687 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
689 for (i = 0; i < c->cov->size1 - 1; i++)
692 for (j = 0; j < c->cov->size2 - 1; j++)
694 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
696 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
699 for (j = 0; j < c->cov->size2 - 1; j++)
701 fprintf (fp, "%.15e, ",
702 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
704 fprintf (fp, "%.15e}\n\t",
705 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
706 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
708 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
709 fprintf (fp, "%s", reg_variance);
710 fprintf (fp, "%s", reg_export_confidence_interval);
711 tmp = c->mse * c->mse;
712 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
713 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
714 fprintf (fp, "%s", reg_export_prediction_interval_3);
716 fp = fopen ("pspp_model_reg.h", "w");
717 fprintf (fp, "%s", reg_header);
722 regression_custom_export (struct cmd_regression *cmd)
724 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
725 if (!lex_force_match ('('))
732 model_file = fh_parse ();
733 if (model_file == NULL)
737 if (!lex_force_match (')'))
744 cmd_regression (void)
746 if (!parse_regression (&cmd))
750 multipass_procedure_with_splits (run_regression, &cmd);
756 Is variable k one of the dependent variables?
762 for (j = 0; j < cmd.n_dependent; j++)
765 compare_var_names returns 0 if the variable
768 if (!compare_var_names (cmd.v_dependent[j], cmd.v_variables[k], NULL))
775 Mark missing cases. Return the number of non-missing cases.
778 mark_missing_cases (const struct casefile *cf, struct variable *v,
779 double *is_missing_case, double n_data)
781 struct casereader *r;
786 for (r = casefile_get_reader (cf);
787 casereader_read (r, &c); case_destroy (&c))
789 row = casereader_cnum (r) - 1;
791 val = case_data (&c, v->fv);
792 cat_value_update (v, val);
793 if (mv_is_value_missing (&v->miss, val))
795 if (!is_missing_case[row])
797 /* Now it is missing. */
799 is_missing_case[row] = 1;
803 casereader_destroy (r);
808 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
818 Keep track of the missing cases.
820 int *is_missing_case;
821 const union value *val;
822 struct casereader *r;
825 struct variable *depvar;
826 struct variable **indep_vars;
827 struct design_matrix *X;
829 pspp_linreg_cache *lcache;
830 pspp_linreg_opts lopts;
832 n_data = casefile_get_case_cnt (cf);
834 for (i = 0; i < cmd.n_dependent; i++)
836 if (cmd.v_dependent[i]->type != NUMERIC)
838 msg (SE, gettext ("Dependent variable must be numeric."));
839 pspp_reg_rc = CMD_FAILURE;
844 is_missing_case = xnmalloc (n_data, sizeof (*is_missing_case));
845 for (i = 0; i < n_data; i++)
846 is_missing_case[i] = 0;
848 n_indep = cmd.n_variables - cmd.n_dependent;
849 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
851 lopts.get_depvar_mean_std = 1;
852 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
855 Read from the active file. The first pass encodes categorical
856 variables and drops cases with missing values.
859 for (i = 0; i < cmd.n_variables; i++)
863 v = cmd.v_variables[i];
866 if (v->type == ALPHA)
868 /* Make a place to hold the binary vectors
869 corresponding to this variable's values. */
870 cat_stored_values_create (v);
872 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
877 Drop cases with missing values for any dependent variable.
880 for (i = 0; i < cmd.n_dependent; i++)
882 v = cmd.v_dependent[i];
884 n_data = mark_missing_cases (cf, v, is_missing_case, n_data);
887 for (k = 0; k < cmd.n_dependent; k++)
889 depvar = cmd.v_dependent[k];
890 Y = gsl_vector_alloc (n_data);
893 design_matrix_create (n_indep, (const struct variable **) indep_vars,
895 for (i = 0; i < X->m->size2; i++)
897 lopts.get_indep_mean_std[i] = 1;
899 lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
900 lcache->indep_means = gsl_vector_alloc (X->m->size2);
901 lcache->indep_std = gsl_vector_alloc (X->m->size2);
902 lcache->depvar = (const struct variable *) depvar;
904 For large data sets, use QR decomposition.
906 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
908 lcache->method = PSPP_LINREG_SVD;
912 The second pass creates the design matrix.
915 for (r = casefile_get_reader (cf); casereader_read (r, &c);
917 /* Iterate over the cases. */
919 case_num = casereader_cnum (r) - 1;
920 if (!is_missing_case[case_num])
922 for (i = 0; i < cmd.n_variables; ++i) /* Iterate over the variables
923 for the current case.
926 v = cmd.v_variables[i];
927 val = case_data (&c, v->fv);
929 Independent/dependent variable separation. The
930 'variables' subcommand specifies a varlist which contains
931 both dependent and independent variables. The dependent
932 variables are specified with the 'dependent'
933 subcommand, and maybe also in the 'variables' subcommand.
934 We need to separate the two.
938 if (v->type == ALPHA)
940 design_matrix_set_categorical (X, row, v, val);
942 else if (v->type == NUMERIC)
944 design_matrix_set_numeric (X, row, v, val);
948 val = case_data (&c, depvar->fv);
949 gsl_vector_set (Y, row, val->f);
954 Now that we know the number of coefficients, allocate space
955 and store pointers to the variables that correspond to the
958 lcache->coeff = xnmalloc (X->m->size2 + 1, sizeof (*lcache->coeff));
959 for (i = 0; i < X->m->size2; i++)
961 j = i + 1; /* The first coeff is the intercept. */
963 (const struct variable *) design_matrix_col_to_var (X, i);
964 assert (lcache->coeff[j].v != NULL);
968 Find the least-squares estimates and other statistics.
970 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, lcache);
971 subcommand_statistics (cmd.a_statistics, lcache);
972 subcommand_export (cmd.sbc_export, lcache);
974 design_matrix_destroy (X);
975 pspp_linreg_cache_free (lcache);
976 free (lopts.get_indep_mean_std);
977 casereader_destroy (r);
980 free (is_missing_case);