1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_matrix.h>
24 #include <gsl/gsl_vector.h>
28 #include "regression-export.h"
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/cat-routines.h>
32 #include <data/category.h>
33 #include <data/dictionary.h>
34 #include <data/missing-values.h>
35 #include <data/procedure.h>
36 #include <data/transformations.h>
37 #include <data/value-labels.h>
38 #include <data/variable.h>
39 #include <language/command.h>
40 #include <language/dictionary/split-file.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <libpspp/alloc.h>
44 #include <libpspp/compiler.h>
45 #include <libpspp/message.h>
46 #include <math/design-matrix.h>
47 #include <math/coefficient.h>
48 #include <math/linreg/linreg.h>
49 #include <output/table.h>
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
84 static struct cmd_regression cmd;
86 /* Linear regression models. */
87 pspp_linreg_cache **models = NULL;
90 Transformations for saving predicted values
95 int n_trns; /* Number of transformations. */
96 int trns_id; /* Which trns is this one? */
97 pspp_linreg_cache *c; /* Linear model for this trns. */
100 Variables used (both explanatory and response).
102 static struct variable **v_variables;
107 static size_t n_variables;
110 File where the model will be saved if the EXPORT subcommand
113 struct file_handle *model_file;
116 Return value for the procedure.
118 int pspp_reg_rc = CMD_SUCCESS;
120 static bool run_regression (const struct ccase *,
121 const struct casefile *, void *);
124 STATISTICS subcommand output functions.
126 static void reg_stats_r (pspp_linreg_cache *);
127 static void reg_stats_coeff (pspp_linreg_cache *);
128 static void reg_stats_anova (pspp_linreg_cache *);
129 static void reg_stats_outs (pspp_linreg_cache *);
130 static void reg_stats_zpp (pspp_linreg_cache *);
131 static void reg_stats_label (pspp_linreg_cache *);
132 static void reg_stats_sha (pspp_linreg_cache *);
133 static void reg_stats_ci (pspp_linreg_cache *);
134 static void reg_stats_f (pspp_linreg_cache *);
135 static void reg_stats_bcov (pspp_linreg_cache *);
136 static void reg_stats_ses (pspp_linreg_cache *);
137 static void reg_stats_xtx (pspp_linreg_cache *);
138 static void reg_stats_collin (pspp_linreg_cache *);
139 static void reg_stats_tol (pspp_linreg_cache *);
140 static void reg_stats_selection (pspp_linreg_cache *);
141 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
142 int, pspp_linreg_cache *);
145 reg_stats_r (pspp_linreg_cache * c)
155 rsq = c->ssm / c->sst;
156 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
157 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
158 t = tab_create (n_cols, n_rows, 0);
159 tab_dim (t, tab_natural_dimensions);
160 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
161 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
162 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
163 tab_vline (t, TAL_0, 1, 0, 0);
165 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
166 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
167 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
168 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
169 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
170 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
171 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
172 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
173 tab_title (t, _("Model Summary"));
178 Table showing estimated regression coefficients.
181 reg_stats_coeff (pspp_linreg_cache * c)
193 const struct variable *v;
194 const union value *val;
199 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
200 n_rows = c->n_coeffs + 2;
202 t = tab_create (n_cols, n_rows, 0);
203 tab_headers (t, 2, 0, 1, 0);
204 tab_dim (t, tab_natural_dimensions);
205 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
206 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
207 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
208 tab_vline (t, TAL_0, 1, 0, 0);
210 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
211 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
212 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
213 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
214 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
215 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
216 coeff = c->coeff[0]->estimate;
217 tab_float (t, 2, 1, 0, coeff, 10, 2);
218 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
219 tab_float (t, 3, 1, 0, std_err, 10, 2);
220 beta = coeff / c->depvar_std;
221 tab_float (t, 4, 1, 0, beta, 10, 2);
222 t_stat = coeff / std_err;
223 tab_float (t, 5, 1, 0, t_stat, 10, 2);
224 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
225 tab_float (t, 6, 1, 0, pval, 10, 2);
226 for (j = 1; j <= c->n_indeps; j++)
228 v = pspp_linreg_coeff_get_var (c->coeff[j], 0);
229 label = var_to_string (v);
230 /* Do not overwrite the variable's name. */
231 strncpy (tmp, label, MAX_STRING);
232 if (v->type == ALPHA)
235 Append the value associated with this coefficient.
236 This makes sense only if we us the usual binary encoding
240 val = pspp_linreg_coeff_get_value (c->coeff[j], v);
241 val_s = value_to_string (val, v);
242 strncat (tmp, val_s, MAX_STRING);
245 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
247 Regression coefficients.
249 coeff = c->coeff[j]->estimate;
250 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
252 Standard error of the coefficients.
254 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
255 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
257 'Standardized' coefficient, i.e., regression coefficient
258 if all variables had unit variance.
260 beta = gsl_vector_get (c->indep_std, j);
261 beta *= coeff / c->depvar_std;
262 tab_float (t, 4, j + 1, 0, beta, 10, 2);
265 Test statistic for H0: coefficient is 0.
267 t_stat = coeff / std_err;
268 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
270 P values for the test statistic above.
272 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
273 tab_float (t, 6, j + 1, 0, pval, 10, 2);
275 tab_title (t, _("Coefficients"));
281 Display the ANOVA table.
284 reg_stats_anova (pspp_linreg_cache * c)
288 const double msm = c->ssm / c->dfm;
289 const double mse = c->sse / c->dfe;
290 const double F = msm / mse;
291 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
296 t = tab_create (n_cols, n_rows, 0);
297 tab_headers (t, 2, 0, 1, 0);
298 tab_dim (t, tab_natural_dimensions);
300 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
302 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
303 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
304 tab_vline (t, TAL_0, 1, 0, 0);
306 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
307 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
308 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
309 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
310 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
312 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
313 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
314 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
316 /* Sums of Squares */
317 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
318 tab_float (t, 2, 3, 0, c->sst, 10, 2);
319 tab_float (t, 2, 2, 0, c->sse, 10, 2);
322 /* Degrees of freedom */
323 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
324 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
325 tab_float (t, 3, 3, 0, c->dft, 4, 0);
329 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
330 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
332 tab_float (t, 5, 1, 0, F, 8, 3);
334 tab_float (t, 6, 1, 0, pval, 8, 3);
336 tab_title (t, _("ANOVA"));
340 reg_stats_outs (pspp_linreg_cache * c)
345 reg_stats_zpp (pspp_linreg_cache * c)
350 reg_stats_label (pspp_linreg_cache * c)
355 reg_stats_sha (pspp_linreg_cache * c)
360 reg_stats_ci (pspp_linreg_cache * c)
365 reg_stats_f (pspp_linreg_cache * c)
370 reg_stats_bcov (pspp_linreg_cache * c)
382 n_cols = c->n_indeps + 1 + 2;
383 n_rows = 2 * (c->n_indeps + 1);
384 t = tab_create (n_cols, n_rows, 0);
385 tab_headers (t, 2, 0, 1, 0);
386 tab_dim (t, tab_natural_dimensions);
387 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
388 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
389 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
390 tab_vline (t, TAL_0, 1, 0, 0);
391 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
392 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
393 for (i = 1; i < c->n_coeffs; i++)
395 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff[i], 0);
396 label = var_to_string (v);
397 tab_text (t, 2, i, TAB_CENTER, label);
398 tab_text (t, i + 2, 0, TAB_CENTER, label);
399 for (k = 1; k < c->n_coeffs; k++)
401 col = (i <= k) ? k : i;
402 row = (i <= k) ? i : k;
403 tab_float (t, k + 2, i, TAB_CENTER,
404 gsl_matrix_get (c->cov, row, col), 8, 3);
407 tab_title (t, _("Coefficient Correlations"));
411 reg_stats_ses (pspp_linreg_cache * c)
416 reg_stats_xtx (pspp_linreg_cache * c)
421 reg_stats_collin (pspp_linreg_cache * c)
426 reg_stats_tol (pspp_linreg_cache * c)
431 reg_stats_selection (pspp_linreg_cache * c)
437 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
438 int keyword, pspp_linreg_cache * c)
447 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
450 The order here must match the order in which the STATISTICS
451 keywords appear in the specification section above.
478 Set everything but F.
480 for (i = 0; i < f; i++)
487 for (i = 0; i < all; i++)
495 Default output: ANOVA table, parameter estimates,
496 and statistics for variables not entered into model,
499 if (keywords[defaults] | d)
507 statistics_keyword_output (reg_stats_r, keywords[r], c);
508 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
509 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
510 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
511 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
512 statistics_keyword_output (reg_stats_label, keywords[label], c);
513 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
514 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
515 statistics_keyword_output (reg_stats_f, keywords[f], c);
516 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
517 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
518 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
519 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
520 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
521 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
525 Free the transformation. Free its linear model if this
526 transformation is the last one.
529 regression_trns_free (void *t_)
532 struct reg_trns *t = t_;
534 if (t->trns_id == t->n_trns)
536 result = pspp_linreg_cache_free (t->c);
544 Gets the predicted values.
547 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
551 struct reg_trns *trns = t_;
552 pspp_linreg_cache *model;
553 union value *output = NULL;
554 const union value **vals = NULL;
555 struct variable **vars = NULL;
557 assert (trns != NULL);
559 assert (model != NULL);
560 assert (model->depvar != NULL);
561 assert (model->pred != NULL);
563 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
564 n_vals = (*model->get_vars) (model, vars);
566 vals = xnmalloc (n_vals, sizeof (*vals));
567 output = case_data_rw (c, model->pred->fv);
568 assert (output != NULL);
570 for (i = 0; i < n_vals; i++)
572 vals[i] = case_data (c, vars[i]->fv);
574 output->f = (*model->predict) ((const struct variable **) vars,
575 vals, model, n_vals);
578 return TRNS_CONTINUE;
585 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
589 struct reg_trns *trns = t_;
590 pspp_linreg_cache *model;
591 union value *output = NULL;
592 const union value **vals = NULL;
593 const union value *obs = NULL;
594 struct variable **vars = NULL;
596 assert (trns != NULL);
598 assert (model != NULL);
599 assert (model->depvar != NULL);
600 assert (model->resid != NULL);
602 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
603 n_vals = (*model->get_vars) (model, vars);
605 vals = xnmalloc (n_vals, sizeof (*vals));
606 output = case_data_rw (c, model->resid->fv);
607 assert (output != NULL);
609 for (i = 0; i < n_vals; i++)
611 vals[i] = case_data (c, vars[i]->fv);
613 obs = case_data (c, model->depvar->fv);
614 output->f = (*model->residual) ((const struct variable **) vars,
615 vals, obs, model, n_vals);
618 return TRNS_CONTINUE;
622 Returns 0 if NAME is a duplicate of any existing variable name.
625 try_name (char *name)
627 if (dict_lookup_var (default_dict, name) != NULL)
633 reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
637 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
638 while (!try_name (name))
641 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
645 reg_save_var (const char *prefix, trns_proc_func * f,
646 pspp_linreg_cache * c, struct variable **v, int n_trns)
648 static int trns_index = 1;
649 char name[LONG_NAME_LEN];
650 struct variable *new_var;
651 struct reg_trns *t = NULL;
653 t = xmalloc (sizeof (*t));
654 t->trns_id = trns_index;
657 reg_get_name (name, prefix);
658 new_var = dict_create_var (default_dict, name, 0);
659 assert (new_var != NULL);
661 add_transformation (f, regression_trns_free, t);
665 subcommand_save (int save, pspp_linreg_cache ** models)
667 pspp_linreg_cache **lc;
671 assert (models != NULL);
675 /* Count the number of transformations we will need. */
676 for (i = 0; i < REGRESSION_SV_count; i++)
683 n_trns *= cmd.n_dependent;
685 for (lc = models; lc < models + cmd.n_dependent; lc++)
687 assert (*lc != NULL);
688 assert ((*lc)->depvar != NULL);
689 if (cmd.a_save[REGRESSION_SV_RESID])
691 reg_save_var ("RES", regression_trns_resid_proc, *lc,
692 &(*lc)->resid, n_trns);
694 if (cmd.a_save[REGRESSION_SV_PRED])
696 reg_save_var ("PRED", regression_trns_pred_proc, *lc,
697 &(*lc)->pred, n_trns);
703 for (lc = models; lc < models + cmd.n_dependent; lc++)
705 assert (*lc != NULL);
706 pspp_linreg_cache_free (*lc);
711 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
715 for (i = 0; i < n_vars; i++)
717 if (v->index == varlist[i]->index)
725 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
730 struct variable **varlist;
731 struct pspp_linreg_coeff *coeff;
732 const struct variable *v;
735 fprintf (fp, "%s", reg_export_categorical_encode_1);
737 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
738 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
741 v = pspp_linreg_coeff_get_var (coeff, 0);
742 if (v->type == ALPHA)
744 if (!reg_inserted (v, varlist, n_vars))
746 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
748 varlist[n_vars] = (struct variable *) v;
753 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
754 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
756 for (i = 0; i < n_vars - 1; i++)
758 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
760 fprintf (fp, "&%s};\n\t", varlist[i]->name);
762 for (i = 0; i < n_vars; i++)
765 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
767 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
768 varlist[i]->obs_vals->n_categories);
770 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
772 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
773 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
774 value_to_string (val, varlist[i]));
777 fprintf (fp, "%s", reg_export_categorical_encode_2);
781 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
784 struct pspp_linreg_coeff *coeff;
785 const struct variable *v;
787 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
788 for (i = 1; i < c->n_indeps; i++)
791 v = pspp_linreg_coeff_get_var (coeff, 0);
792 fprintf (fp, "\"%s\",\n\t\t", v->name);
795 v = pspp_linreg_coeff_get_var (coeff, 0);
796 fprintf (fp, "\"%s\"};\n\t", v->name);
799 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
801 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
802 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
803 reg_print_depvars (fp, c);
804 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
806 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
807 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
810 reg_has_categorical (pspp_linreg_cache * c)
813 const struct variable *v;
815 for (i = 1; i < c->n_coeffs; i++)
817 v = pspp_linreg_coeff_get_var (c->coeff[i], 0);
818 if (v->type == ALPHA)
827 subcommand_export (int export, pspp_linreg_cache * c)
832 int n_quantiles = 100;
835 struct pspp_linreg_coeff *coeff;
840 assert (model_file != NULL);
841 fp = fopen (fh_get_file_name (model_file), "w");
843 fprintf (fp, "%s", reg_preamble);
844 reg_print_getvar (fp, c);
845 if (reg_has_categorical (c))
847 reg_print_categorical_encoding (fp, c);
849 fprintf (fp, "%s", reg_export_t_quantiles_1);
850 increment = 0.5 / (double) increment;
851 for (i = 0; i < n_quantiles - 1; i++)
853 tmp = 0.5 + 0.005 * (double) i;
854 fprintf (fp, "%.15e,\n\t\t",
855 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
857 fprintf (fp, "%.15e};\n\t",
858 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
859 fprintf (fp, "%s", reg_export_t_quantiles_2);
860 fprintf (fp, "%s", reg_mean_cmt);
861 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
862 fprintf (fp, "const char *var_names[])\n{\n\t");
863 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
864 for (i = 1; i < c->n_indeps; i++)
867 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
870 fprintf (fp, "%.15e};\n\t", coeff->estimate);
872 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
873 fprintf (fp, "int i;\n\tint j;\n\n\t");
874 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
875 fprintf (fp, "%s", reg_getvar);
876 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
878 for (i = 0; i < c->cov->size1 - 1; i++)
881 for (j = 0; j < c->cov->size2 - 1; j++)
883 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
885 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
888 for (j = 0; j < c->cov->size2 - 1; j++)
890 fprintf (fp, "%.15e, ",
891 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
893 fprintf (fp, "%.15e}\n\t",
894 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
895 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
897 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
898 fprintf (fp, "%s", reg_variance);
899 fprintf (fp, "%s", reg_export_confidence_interval);
900 tmp = c->mse * c->mse;
901 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
902 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
903 fprintf (fp, "%s", reg_export_prediction_interval_3);
905 fp = fopen ("pspp_model_reg.h", "w");
906 fprintf (fp, "%s", reg_header);
911 regression_custom_export (struct cmd_regression *cmd UNUSED)
913 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
914 if (!lex_force_match ('('))
921 model_file = fh_parse (FH_REF_FILE);
922 if (model_file == NULL)
926 if (!lex_force_match (')'))
933 cmd_regression (void)
935 if (!parse_regression (&cmd))
938 models = xnmalloc (cmd.n_dependent, sizeof *models);
939 if (!multipass_procedure_with_splits (run_regression, &cmd))
940 return CMD_CASCADING_FAILURE;
941 subcommand_save (cmd.sbc_save, models);
948 Is variable k the dependent variable?
951 is_depvar (size_t k, const struct variable *v)
954 compare_var_names returns 0 if the variable
957 if (!compare_var_names (v, v_variables[k], NULL))
964 Mark missing cases. Return the number of non-missing cases.
967 mark_missing_cases (const struct casefile *cf, struct variable *v,
968 int *is_missing_case, double n_data)
970 struct casereader *r;
973 const union value *val;
975 for (r = casefile_get_reader (cf);
976 casereader_read (r, &c); case_destroy (&c))
978 row = casereader_cnum (r) - 1;
980 val = case_data (&c, v->fv);
981 cat_value_update (v, val);
982 if (mv_is_value_missing (&v->miss, val))
984 if (!is_missing_case[row])
986 /* Now it is missing. */
988 is_missing_case[row] = 1;
992 casereader_destroy (r);
997 /* Parser for the variables sub command */
999 regression_custom_variables (struct cmd_regression *cmd UNUSED)
1004 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
1009 if (!parse_variables (default_dict, &v_variables, &n_variables, PV_NONE))
1014 assert (n_variables);
1020 Count the explanatory variables. The user may or may
1021 not have specified a response variable in the syntax.
1024 get_n_indep (const struct variable *v)
1029 result = n_variables;
1030 while (i < n_variables)
1032 if (is_depvar (i, v))
1043 Read from the active file. Identify the explanatory variables in
1044 v_variables. Encode categorical variables. Drop cases with missing
1048 prepare_data (int n_data, int is_missing_case[],
1049 struct variable **indep_vars,
1050 struct variable *depvar, const struct casefile *cf)
1055 assert (indep_vars != NULL);
1057 for (i = 0; i < n_variables; i++)
1059 if (!is_depvar (i, depvar))
1061 indep_vars[j] = v_variables[i];
1063 if (v_variables[i]->type == ALPHA)
1065 /* Make a place to hold the binary vectors
1066 corresponding to this variable's values. */
1067 cat_stored_values_create (v_variables[i]);
1070 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1074 Mark missing cases for the dependent variable.
1076 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1082 run_regression (const struct ccase *first,
1083 const struct casefile *cf, void *cmd_ UNUSED)
1086 size_t n_data = 0; /* Number of valide cases. */
1087 size_t n_cases; /* Number of cases. */
1093 Keep track of the missing cases.
1095 int *is_missing_case;
1096 const union value *val;
1097 struct casereader *r;
1099 struct variable **indep_vars;
1100 struct design_matrix *X;
1103 pspp_linreg_opts lopts;
1105 assert (models != NULL);
1107 output_split_file_values (first);
1111 dict_get_vars (default_dict, &v_variables, &n_variables,
1115 n_cases = casefile_get_case_cnt (cf);
1117 for (i = 0; i < cmd.n_dependent; i++)
1119 if (cmd.v_dependent[i]->type != NUMERIC)
1121 msg (SE, gettext ("Dependent variable must be numeric."));
1122 pspp_reg_rc = CMD_FAILURE;
1127 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1129 lopts.get_depvar_mean_std = 1;
1131 for (k = 0; k < cmd.n_dependent; k++)
1133 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1134 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1135 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1136 assert (indep_vars != NULL);
1138 for (i = 0; i < n_cases; i++)
1140 is_missing_case[i] = 0;
1142 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1144 (const struct casefile *) cf);
1145 Y = gsl_vector_alloc (n_data);
1148 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1150 for (i = 0; i < X->m->size2; i++)
1152 lopts.get_indep_mean_std[i] = 1;
1154 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1155 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1156 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1157 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1159 For large data sets, use QR decomposition.
1161 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1163 models[k]->method = PSPP_LINREG_SVD;
1167 The second pass fills the design matrix.
1170 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1172 /* Iterate over the cases. */
1174 case_num = casereader_cnum (r) - 1;
1175 if (!is_missing_case[case_num])
1177 for (i = 0; i < n_variables; ++i) /* Iterate over the
1182 val = case_data (&c, v_variables[i]->fv);
1184 Independent/dependent variable separation. The
1185 'variables' subcommand specifies a varlist which contains
1186 both dependent and independent variables. The dependent
1187 variables are specified with the 'dependent'
1188 subcommand, and maybe also in the 'variables' subcommand.
1189 We need to separate the two.
1191 if (!is_depvar (i, cmd.v_dependent[k]))
1193 if (v_variables[i]->type == ALPHA)
1195 design_matrix_set_categorical (X, row,
1196 v_variables[i], val);
1198 else if (v_variables[i]->type == NUMERIC)
1200 design_matrix_set_numeric (X, row, v_variables[i],
1205 val = case_data (&c, cmd.v_dependent[k]->fv);
1206 gsl_vector_set (Y, row, val->f);
1211 Now that we know the number of coefficients, allocate space
1212 and store pointers to the variables that correspond to the
1215 pspp_linreg_coeff_init (models[k], X);
1218 Find the least-squares estimates and other statistics.
1220 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1221 subcommand_statistics (cmd.a_statistics, models[k]);
1222 subcommand_export (cmd.sbc_export, models[k]);
1224 gsl_vector_free (Y);
1225 design_matrix_destroy (X);
1227 free (lopts.get_indep_mean_std);
1228 casereader_destroy (r);
1231 free (is_missing_case);