1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/data-io/file-handle.h>
41 #include <language/lexer/lexer.h>
42 #include <libpspp/alloc.h>
43 #include <libpspp/compiler.h>
44 #include <libpspp/message.h>
45 #include <math/design-matrix.h>
46 #include <math/linreg/coefficient.h>
47 #include <math/linreg/linreg.h>
48 #include <output/table.h>
52 #define REG_LARGE_DATA 1000
57 "REGRESSION" (regression_):
83 static struct cmd_regression cmd;
85 /* Linear regression models. */
86 pspp_linreg_cache **models = NULL;
89 Transformations for saving predicted values
94 int n_trns; /* Number of transformations. */
95 int trns_id; /* Which trns is this one? */
96 pspp_linreg_cache *c; /* Linear model for this trns. */
99 Variables used (both explanatory and response).
101 static struct variable **v_variables;
106 static size_t n_variables;
109 File where the model will be saved if the EXPORT subcommand
112 struct file_handle *model_file;
115 Return value for the procedure.
117 int pspp_reg_rc = CMD_SUCCESS;
119 static bool run_regression (const struct casefile *, void *);
122 STATISTICS subcommand output functions.
124 static void reg_stats_r (pspp_linreg_cache *);
125 static void reg_stats_coeff (pspp_linreg_cache *);
126 static void reg_stats_anova (pspp_linreg_cache *);
127 static void reg_stats_outs (pspp_linreg_cache *);
128 static void reg_stats_zpp (pspp_linreg_cache *);
129 static void reg_stats_label (pspp_linreg_cache *);
130 static void reg_stats_sha (pspp_linreg_cache *);
131 static void reg_stats_ci (pspp_linreg_cache *);
132 static void reg_stats_f (pspp_linreg_cache *);
133 static void reg_stats_bcov (pspp_linreg_cache *);
134 static void reg_stats_ses (pspp_linreg_cache *);
135 static void reg_stats_xtx (pspp_linreg_cache *);
136 static void reg_stats_collin (pspp_linreg_cache *);
137 static void reg_stats_tol (pspp_linreg_cache *);
138 static void reg_stats_selection (pspp_linreg_cache *);
139 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
140 int, pspp_linreg_cache *);
143 reg_stats_r (pspp_linreg_cache * c)
153 rsq = c->ssm / c->sst;
154 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
155 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
156 t = tab_create (n_cols, n_rows, 0);
157 tab_dim (t, tab_natural_dimensions);
158 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
159 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
160 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
161 tab_vline (t, TAL_0, 1, 0, 0);
163 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
164 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
165 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
166 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
167 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
168 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
169 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
170 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
171 tab_title (t, _("Model Summary"));
176 Table showing estimated regression coefficients.
179 reg_stats_coeff (pspp_linreg_cache * c)
191 const struct variable *v;
192 const union value *val;
197 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
198 n_rows = c->n_coeffs + 2;
200 t = tab_create (n_cols, n_rows, 0);
201 tab_headers (t, 2, 0, 1, 0);
202 tab_dim (t, tab_natural_dimensions);
203 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
204 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
205 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
206 tab_vline (t, TAL_0, 1, 0, 0);
208 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
209 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
210 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
211 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
212 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
213 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
214 coeff = c->coeff[0].estimate;
215 tab_float (t, 2, 1, 0, coeff, 10, 2);
216 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
217 tab_float (t, 3, 1, 0, std_err, 10, 2);
218 beta = coeff / c->depvar_std;
219 tab_float (t, 4, 1, 0, beta, 10, 2);
220 t_stat = coeff / std_err;
221 tab_float (t, 5, 1, 0, t_stat, 10, 2);
222 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
223 tab_float (t, 6, 1, 0, pval, 10, 2);
224 for (j = 1; j <= c->n_indeps; j++)
226 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
227 label = var_to_string (v);
228 /* Do not overwrite the variable's name. */
229 strncpy (tmp, label, MAX_STRING);
230 if (v->type == ALPHA)
233 Append the value associated with this coefficient.
234 This makes sense only if we us the usual binary encoding
238 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
239 val_s = value_to_string (val, v);
240 strncat (tmp, val_s, MAX_STRING);
243 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
245 Regression coefficients.
247 coeff = c->coeff[j].estimate;
248 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
250 Standard error of the coefficients.
252 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
253 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
255 'Standardized' coefficient, i.e., regression coefficient
256 if all variables had unit variance.
258 beta = gsl_vector_get (c->indep_std, j);
259 beta *= coeff / c->depvar_std;
260 tab_float (t, 4, j + 1, 0, beta, 10, 2);
263 Test statistic for H0: coefficient is 0.
265 t_stat = coeff / std_err;
266 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
268 P values for the test statistic above.
270 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
271 tab_float (t, 6, j + 1, 0, pval, 10, 2);
273 tab_title (t, _("Coefficients"));
279 Display the ANOVA table.
282 reg_stats_anova (pspp_linreg_cache * c)
286 const double msm = c->ssm / c->dfm;
287 const double mse = c->sse / c->dfe;
288 const double F = msm / mse;
289 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
294 t = tab_create (n_cols, n_rows, 0);
295 tab_headers (t, 2, 0, 1, 0);
296 tab_dim (t, tab_natural_dimensions);
298 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
300 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
301 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
302 tab_vline (t, TAL_0, 1, 0, 0);
304 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
305 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
306 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
307 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
308 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
310 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
311 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
312 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
314 /* Sums of Squares */
315 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
316 tab_float (t, 2, 3, 0, c->sst, 10, 2);
317 tab_float (t, 2, 2, 0, c->sse, 10, 2);
320 /* Degrees of freedom */
321 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
322 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
323 tab_float (t, 3, 3, 0, c->dft, 4, 0);
327 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
328 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
330 tab_float (t, 5, 1, 0, F, 8, 3);
332 tab_float (t, 6, 1, 0, pval, 8, 3);
334 tab_title (t, _("ANOVA"));
338 reg_stats_outs (pspp_linreg_cache * c)
343 reg_stats_zpp (pspp_linreg_cache * c)
348 reg_stats_label (pspp_linreg_cache * c)
353 reg_stats_sha (pspp_linreg_cache * c)
358 reg_stats_ci (pspp_linreg_cache * c)
363 reg_stats_f (pspp_linreg_cache * c)
368 reg_stats_bcov (pspp_linreg_cache * c)
380 n_cols = c->n_indeps + 1 + 2;
381 n_rows = 2 * (c->n_indeps + 1);
382 t = tab_create (n_cols, n_rows, 0);
383 tab_headers (t, 2, 0, 1, 0);
384 tab_dim (t, tab_natural_dimensions);
385 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
386 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
387 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
388 tab_vline (t, TAL_0, 1, 0, 0);
389 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
390 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
391 for (i = 1; i < c->n_coeffs; i++)
393 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
394 label = var_to_string (v);
395 tab_text (t, 2, i, TAB_CENTER, label);
396 tab_text (t, i + 2, 0, TAB_CENTER, label);
397 for (k = 1; k < c->n_coeffs; k++)
399 col = (i <= k) ? k : i;
400 row = (i <= k) ? i : k;
401 tab_float (t, k + 2, i, TAB_CENTER,
402 gsl_matrix_get (c->cov, row, col), 8, 3);
405 tab_title (t, _("Coefficient Correlations"));
409 reg_stats_ses (pspp_linreg_cache * c)
414 reg_stats_xtx (pspp_linreg_cache * c)
419 reg_stats_collin (pspp_linreg_cache * c)
424 reg_stats_tol (pspp_linreg_cache * c)
429 reg_stats_selection (pspp_linreg_cache * c)
435 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
436 int keyword, pspp_linreg_cache * c)
445 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
448 The order here must match the order in which the STATISTICS
449 keywords appear in the specification section above.
476 Set everything but F.
478 for (i = 0; i < f; i++)
485 for (i = 0; i < all; i++)
493 Default output: ANOVA table, parameter estimates,
494 and statistics for variables not entered into model,
497 if (keywords[defaults] | d)
505 statistics_keyword_output (reg_stats_r, keywords[r], c);
506 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
507 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
508 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
509 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
510 statistics_keyword_output (reg_stats_label, keywords[label], c);
511 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
512 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
513 statistics_keyword_output (reg_stats_f, keywords[f], c);
514 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
515 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
516 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
517 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
518 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
519 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
522 Free the transformation. Free its linear model if this
523 transformation is the last one.
526 bool regression_trns_free (void *t_)
529 struct reg_trns *t = t_;
531 if (t->trns_id == t->n_trns)
533 result = pspp_linreg_cache_free (t->c);
540 Gets the predicted values.
543 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
547 struct reg_trns *trns = t_;
548 pspp_linreg_cache *model;
549 union value *output = NULL;
550 const union value **vals = NULL;
551 struct variable **vars = NULL;
553 assert (trns!= NULL);
555 assert (model != NULL);
556 assert (model->depvar != NULL);
557 assert (model->pred != NULL);
559 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
560 n_vals = (*model->get_vars) (model, vars);
562 vals = xnmalloc (n_vals, sizeof (*vals));
563 output = case_data_rw (c, model->pred->fv);
564 assert (output != NULL);
566 for (i = 0; i < n_vals; i++)
568 vals[i] = case_data (c, vars[i]->fv);
570 output->f = (*model->predict) ((const struct variable **) vars,
571 vals, model, n_vals);
574 return TRNS_CONTINUE;
580 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
584 struct reg_trns *trns = t_;
585 pspp_linreg_cache *model;
586 union value *output = NULL;
587 const union value **vals = NULL;
588 const union value *obs = NULL;
589 struct variable **vars = NULL;
591 assert (trns!= NULL);
593 assert (model != NULL);
594 assert (model->depvar != NULL);
595 assert (model->resid != NULL);
597 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
598 n_vals = (*model->get_vars) (model, vars);
600 vals = xnmalloc (n_vals, sizeof (*vals));
601 output = case_data_rw (c, model->resid->fv);
602 assert (output != NULL);
604 for (i = 0; i < n_vals; i++)
606 vals[i] = case_data (c, vars[i]->fv);
608 obs = case_data (c, model->depvar->fv);
609 output->f = (*model->residual) ((const struct variable **) vars,
610 vals, obs, model, n_vals);
613 return TRNS_CONTINUE;
616 Returns 0 if NAME is a duplicate of any existing variable name.
619 try_name (char *name)
621 if (dict_lookup_var (default_dict, name) != NULL)
627 void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
631 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
632 while (!try_name(name))
635 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
639 reg_save_var (const char *prefix, trns_proc_func *f,
640 pspp_linreg_cache *c, struct variable **v,
643 static int trns_index = 1;
644 char name[LONG_NAME_LEN];
645 struct variable *new_var;
646 struct reg_trns *t = NULL;
648 t = xmalloc (sizeof (*t));
649 t->trns_id = trns_index;
652 reg_get_name (name, prefix);
653 new_var = dict_create_var (default_dict, name, 0);
654 assert (new_var != NULL);
656 add_transformation (f, regression_trns_free, t);
660 subcommand_save (int save, pspp_linreg_cache **models)
662 pspp_linreg_cache **lc;
666 assert (models != NULL);
670 /* Count the number of transformations we will need. */
671 for (i = 0; i < REGRESSION_SV_count; i++)
678 n_trns *= cmd.n_dependent;
680 for (lc = models; lc < models + cmd.n_dependent; lc++)
682 assert (*lc != NULL);
683 assert ((*lc)->depvar != NULL);
684 if (cmd.a_save[REGRESSION_SV_RESID])
686 reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns);
688 if (cmd.a_save[REGRESSION_SV_PRED])
690 reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns);
696 for (lc = models; lc < models + cmd.n_dependent; lc++)
698 assert (*lc != NULL);
699 pspp_linreg_cache_free (*lc);
704 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
708 for (i = 0; i < n_vars; i++)
710 if (v->index == varlist[i]->index)
718 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
723 struct variable **varlist;
724 struct pspp_linreg_coeff *coeff;
725 const struct variable *v;
728 fprintf (fp, "%s", reg_export_categorical_encode_1);
730 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
731 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
733 coeff = c->coeff + i;
734 v = pspp_linreg_coeff_get_var (coeff, 0);
735 if (v->type == ALPHA)
737 if (!reg_inserted (v, varlist, n_vars))
739 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
741 varlist[n_vars] = (struct variable *) v;
746 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
747 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
749 for (i = 0; i < n_vars - 1; i++)
751 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
753 fprintf (fp, "&%s};\n\t", varlist[i]->name);
755 for (i = 0; i < n_vars; i++)
757 coeff = c->coeff + i;
758 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
760 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
761 varlist[i]->obs_vals->n_categories);
763 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
765 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
766 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
767 value_to_string (val, varlist[i]));
770 fprintf (fp, "%s", reg_export_categorical_encode_2);
774 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
777 struct pspp_linreg_coeff *coeff;
778 const struct variable *v;
780 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
781 for (i = 1; i < c->n_indeps; i++)
783 coeff = c->coeff + i;
784 v = pspp_linreg_coeff_get_var (coeff, 0);
785 fprintf (fp, "\"%s\",\n\t\t", v->name);
787 coeff = c->coeff + i;
788 v = pspp_linreg_coeff_get_var (coeff, 0);
789 fprintf (fp, "\"%s\"};\n\t", v->name);
792 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
794 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
795 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
796 reg_print_depvars (fp, c);
797 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
799 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
800 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
803 reg_has_categorical (pspp_linreg_cache * c)
806 const struct variable *v;
808 for (i = 1; i < c->n_coeffs; i++)
810 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
811 if (v->type == ALPHA)
820 subcommand_export (int export, pspp_linreg_cache * c)
825 int n_quantiles = 100;
828 struct pspp_linreg_coeff coeff;
833 assert (model_file != NULL);
834 fp = fopen (fh_get_file_name (model_file), "w");
836 fprintf (fp, "%s", reg_preamble);
837 reg_print_getvar (fp, c);
838 if (reg_has_categorical (c))
840 reg_print_categorical_encoding (fp, c);
842 fprintf (fp, "%s", reg_export_t_quantiles_1);
843 increment = 0.5 / (double) increment;
844 for (i = 0; i < n_quantiles - 1; i++)
846 tmp = 0.5 + 0.005 * (double) i;
847 fprintf (fp, "%.15e,\n\t\t",
848 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
850 fprintf (fp, "%.15e};\n\t",
851 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
852 fprintf (fp, "%s", reg_export_t_quantiles_2);
853 fprintf (fp, "%s", reg_mean_cmt);
854 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
855 fprintf (fp, "const char *var_names[])\n{\n\t");
856 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
857 for (i = 1; i < c->n_indeps; i++)
860 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
863 fprintf (fp, "%.15e};\n\t", coeff.estimate);
865 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
866 fprintf (fp, "int i;\n\tint j;\n\n\t");
867 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
868 fprintf (fp, "%s", reg_getvar);
869 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
871 for (i = 0; i < c->cov->size1 - 1; i++)
874 for (j = 0; j < c->cov->size2 - 1; j++)
876 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
878 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
881 for (j = 0; j < c->cov->size2 - 1; j++)
883 fprintf (fp, "%.15e, ",
884 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
886 fprintf (fp, "%.15e}\n\t",
887 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
888 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
890 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
891 fprintf (fp, "%s", reg_variance);
892 fprintf (fp, "%s", reg_export_confidence_interval);
893 tmp = c->mse * c->mse;
894 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
895 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
896 fprintf (fp, "%s", reg_export_prediction_interval_3);
898 fp = fopen ("pspp_model_reg.h", "w");
899 fprintf (fp, "%s", reg_header);
904 regression_custom_export (struct cmd_regression *cmd UNUSED)
906 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
907 if (!lex_force_match ('('))
914 model_file = fh_parse (FH_REF_FILE);
915 if (model_file == NULL)
919 if (!lex_force_match (')'))
926 cmd_regression (void)
928 if (!parse_regression (&cmd))
931 models = xnmalloc (cmd.n_dependent, sizeof *models);
932 if (!multipass_procedure_with_splits (run_regression, &cmd))
933 return CMD_CASCADING_FAILURE;
934 subcommand_save (cmd.sbc_save, models);
941 Is variable k the dependent variable?
944 is_depvar (size_t k, const struct variable *v)
947 compare_var_names returns 0 if the variable
950 if (!compare_var_names (v, v_variables[k], NULL))
957 Mark missing cases. Return the number of non-missing cases.
960 mark_missing_cases (const struct casefile *cf, struct variable *v,
961 int *is_missing_case, double n_data)
963 struct casereader *r;
966 const union value *val;
968 for (r = casefile_get_reader (cf);
969 casereader_read (r, &c); case_destroy (&c))
971 row = casereader_cnum (r) - 1;
973 val = case_data (&c, v->fv);
974 cat_value_update (v, val);
975 if (mv_is_value_missing (&v->miss, val))
977 if (!is_missing_case[row])
979 /* Now it is missing. */
981 is_missing_case[row] = 1;
985 casereader_destroy (r);
990 /* Parser for the variables sub command */
992 regression_custom_variables(struct cmd_regression *cmd UNUSED)
997 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
1002 if (!parse_variables (default_dict, &v_variables, &n_variables,
1008 assert(n_variables);
1013 Count the explanatory variables. The user may or may
1014 not have specified a response variable in the syntax.
1017 int get_n_indep (const struct variable *v)
1022 result = n_variables;
1023 while (i < n_variables)
1025 if (is_depvar (i, v))
1035 Read from the active file. Identify the explanatory variables in
1036 v_variables. Encode categorical variables. Drop cases with missing
1040 int prepare_data (int n_data, int is_missing_case[],
1041 struct variable **indep_vars,
1042 struct variable *depvar,
1043 const struct casefile *cf)
1048 assert (indep_vars != NULL);
1050 for (i = 0; i < n_variables; i++)
1052 if (!is_depvar (i, depvar))
1054 indep_vars[j] = v_variables[i];
1056 if (v_variables[i]->type == ALPHA)
1058 /* Make a place to hold the binary vectors
1059 corresponding to this variable's values. */
1060 cat_stored_values_create (v_variables[i]);
1062 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1066 Mark missing cases for the dependent variable.
1068 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1073 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
1076 size_t n_data = 0; /* Number of valide cases. */
1077 size_t n_cases; /* Number of cases. */
1083 Keep track of the missing cases.
1085 int *is_missing_case;
1086 const union value *val;
1087 struct casereader *r;
1089 struct variable **indep_vars;
1090 struct design_matrix *X;
1093 pspp_linreg_opts lopts;
1095 assert (models != NULL);
1098 dict_get_vars (default_dict, &v_variables, &n_variables,
1102 n_cases = casefile_get_case_cnt (cf);
1104 for (i = 0; i < cmd.n_dependent; i++)
1106 if (cmd.v_dependent[i]->type != NUMERIC)
1108 msg (SE, gettext ("Dependent variable must be numeric."));
1109 pspp_reg_rc = CMD_FAILURE;
1114 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1116 lopts.get_depvar_mean_std = 1;
1118 for (k = 0; k < cmd.n_dependent; k++)
1120 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1121 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1122 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1123 assert (indep_vars != NULL);
1125 for (i = 0; i < n_cases; i++)
1127 is_missing_case[i] = 0;
1129 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1131 (const struct casefile *) cf);
1132 Y = gsl_vector_alloc (n_data);
1135 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1137 for (i = 0; i < X->m->size2; i++)
1139 lopts.get_indep_mean_std[i] = 1;
1141 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1142 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1143 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1144 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1146 For large data sets, use QR decomposition.
1148 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1150 models[k]->method = PSPP_LINREG_SVD;
1154 The second pass fills the design matrix.
1157 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1159 /* Iterate over the cases. */
1161 case_num = casereader_cnum (r) - 1;
1162 if (!is_missing_case[case_num])
1164 for (i = 0; i < n_variables; ++i) /* Iterate over the
1169 val = case_data (&c, v_variables[i]->fv);
1171 Independent/dependent variable separation. The
1172 'variables' subcommand specifies a varlist which contains
1173 both dependent and independent variables. The dependent
1174 variables are specified with the 'dependent'
1175 subcommand, and maybe also in the 'variables' subcommand.
1176 We need to separate the two.
1178 if (!is_depvar (i, cmd.v_dependent[k]))
1180 if (v_variables[i]->type == ALPHA)
1182 design_matrix_set_categorical (X, row, v_variables[i], val);
1184 else if (v_variables[i]->type == NUMERIC)
1186 design_matrix_set_numeric (X, row, v_variables[i], val);
1190 val = case_data (&c, cmd.v_dependent[k]->fv);
1191 gsl_vector_set (Y, row, val->f);
1196 Now that we know the number of coefficients, allocate space
1197 and store pointers to the variables that correspond to the
1200 pspp_linreg_coeff_init (models[k], X);
1203 Find the least-squares estimates and other statistics.
1205 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1206 subcommand_statistics (cmd.a_statistics, models[k]);
1207 subcommand_export (cmd.sbc_export, models[k]);
1209 gsl_vector_free (Y);
1210 design_matrix_destroy (X);
1212 free (lopts.get_indep_mean_std);
1213 casereader_destroy (r);
1216 free (is_missing_case);