1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include "regression-export.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/category.h>
30 #include <data/dictionary.h>
31 #include <data/missing-values.h>
32 #include <data/procedure.h>
33 #include <data/transformations.h>
34 #include <data/value-labels.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/dictionary/split-file.h>
38 #include <language/data-io/file-handle.h>
39 #include <language/lexer/lexer.h>
40 #include <libpspp/compiler.h>
41 #include <libpspp/message.h>
42 #include <libpspp/taint.h>
43 #include <math/design-matrix.h>
44 #include <math/coefficient.h>
45 #include <math/linreg/linreg.h>
46 #include <math/moments.h>
47 #include <output/table.h>
52 #define _(msgid) gettext (msgid)
54 #define REG_LARGE_DATA 1000
59 "REGRESSION" (regression_):
80 +save[sv_]=resid,pred;
85 static struct cmd_regression cmd;
88 Moments for each of the variables used.
93 const struct variable *v;
97 Transformations for saving predicted values
102 int n_trns; /* Number of transformations. */
103 int trns_id; /* Which trns is this one? */
104 pspp_linreg_cache *c; /* Linear model for this trns. */
107 Variables used (both explanatory and response).
109 static const struct variable **v_variables;
114 static size_t n_variables;
117 File where the model will be saved if the EXPORT subcommand
120 static struct file_handle *model_file;
122 static bool run_regression (struct casereader *, struct cmd_regression *,
123 struct dataset *, pspp_linreg_cache **);
126 STATISTICS subcommand output functions.
128 static void reg_stats_r (pspp_linreg_cache *);
129 static void reg_stats_coeff (pspp_linreg_cache *);
130 static void reg_stats_anova (pspp_linreg_cache *);
131 static void reg_stats_outs (pspp_linreg_cache *);
132 static void reg_stats_zpp (pspp_linreg_cache *);
133 static void reg_stats_label (pspp_linreg_cache *);
134 static void reg_stats_sha (pspp_linreg_cache *);
135 static void reg_stats_ci (pspp_linreg_cache *);
136 static void reg_stats_f (pspp_linreg_cache *);
137 static void reg_stats_bcov (pspp_linreg_cache *);
138 static void reg_stats_ses (pspp_linreg_cache *);
139 static void reg_stats_xtx (pspp_linreg_cache *);
140 static void reg_stats_collin (pspp_linreg_cache *);
141 static void reg_stats_tol (pspp_linreg_cache *);
142 static void reg_stats_selection (pspp_linreg_cache *);
143 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
144 int, pspp_linreg_cache *);
147 reg_stats_r (pspp_linreg_cache * c)
157 rsq = c->ssm / c->sst;
158 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
159 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
160 t = tab_create (n_cols, n_rows, 0);
161 tab_dim (t, tab_natural_dimensions);
162 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
163 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
164 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
165 tab_vline (t, TAL_0, 1, 0, 0);
167 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
168 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
169 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
170 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
171 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
172 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
173 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
174 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
175 tab_title (t, _("Model Summary"));
180 Table showing estimated regression coefficients.
183 reg_stats_coeff (pspp_linreg_cache * c)
195 const struct variable *v;
196 const union value *val;
201 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
202 n_rows = c->n_coeffs + 2;
204 t = tab_create (n_cols, n_rows, 0);
205 tab_headers (t, 2, 0, 1, 0);
206 tab_dim (t, tab_natural_dimensions);
207 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
208 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
209 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
210 tab_vline (t, TAL_0, 1, 0, 0);
212 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
213 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
214 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
215 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
216 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
217 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
218 coeff = c->coeff[0]->estimate;
219 tab_float (t, 2, 1, 0, coeff, 10, 2);
220 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
221 tab_float (t, 3, 1, 0, std_err, 10, 2);
222 beta = coeff / c->depvar_std;
223 tab_float (t, 4, 1, 0, beta, 10, 2);
224 t_stat = coeff / std_err;
225 tab_float (t, 5, 1, 0, t_stat, 10, 2);
226 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
227 tab_float (t, 6, 1, 0, pval, 10, 2);
228 for (j = 1; j <= c->n_indeps; j++)
230 v = pspp_coeff_get_var (c->coeff[j], 0);
231 label = var_to_string (v);
232 /* Do not overwrite the variable's name. */
233 strncpy (tmp, label, MAX_STRING);
234 if (var_is_alpha (v))
237 Append the value associated with this coefficient.
238 This makes sense only if we us the usual binary encoding
242 val = pspp_coeff_get_value (c->coeff[j], v);
243 val_s = var_get_value_name (v, val);
244 strncat (tmp, val_s, MAX_STRING);
247 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
249 Regression coefficients.
251 coeff = c->coeff[j]->estimate;
252 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
254 Standard error of the coefficients.
256 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
257 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
259 'Standardized' coefficient, i.e., regression coefficient
260 if all variables had unit variance.
262 beta = gsl_vector_get (c->indep_std, j);
263 beta *= coeff / c->depvar_std;
264 tab_float (t, 4, j + 1, 0, beta, 10, 2);
267 Test statistic for H0: coefficient is 0.
269 t_stat = coeff / std_err;
270 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
272 P values for the test statistic above.
275 2 * gsl_cdf_tdist_Q (fabs (t_stat),
276 (double) (c->n_obs - c->n_coeffs));
277 tab_float (t, 6, j + 1, 0, pval, 10, 2);
279 tab_title (t, _("Coefficients"));
285 Display the ANOVA table.
288 reg_stats_anova (pspp_linreg_cache * c)
292 const double msm = c->ssm / c->dfm;
293 const double mse = c->sse / c->dfe;
294 const double F = msm / mse;
295 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
300 t = tab_create (n_cols, n_rows, 0);
301 tab_headers (t, 2, 0, 1, 0);
302 tab_dim (t, tab_natural_dimensions);
304 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
306 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
307 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
308 tab_vline (t, TAL_0, 1, 0, 0);
310 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
311 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
312 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
313 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
314 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
316 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
317 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
318 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
320 /* Sums of Squares */
321 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
322 tab_float (t, 2, 3, 0, c->sst, 10, 2);
323 tab_float (t, 2, 2, 0, c->sse, 10, 2);
326 /* Degrees of freedom */
327 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
328 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
329 tab_float (t, 3, 3, 0, c->dft, 4, 0);
333 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
334 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
336 tab_float (t, 5, 1, 0, F, 8, 3);
338 tab_float (t, 6, 1, 0, pval, 8, 3);
340 tab_title (t, _("ANOVA"));
344 reg_stats_outs (pspp_linreg_cache * c)
349 reg_stats_zpp (pspp_linreg_cache * c)
354 reg_stats_label (pspp_linreg_cache * c)
359 reg_stats_sha (pspp_linreg_cache * c)
364 reg_stats_ci (pspp_linreg_cache * c)
369 reg_stats_f (pspp_linreg_cache * c)
374 reg_stats_bcov (pspp_linreg_cache * c)
386 n_cols = c->n_indeps + 1 + 2;
387 n_rows = 2 * (c->n_indeps + 1);
388 t = tab_create (n_cols, n_rows, 0);
389 tab_headers (t, 2, 0, 1, 0);
390 tab_dim (t, tab_natural_dimensions);
391 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
392 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
393 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
394 tab_vline (t, TAL_0, 1, 0, 0);
395 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
396 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
397 for (i = 1; i < c->n_coeffs; i++)
399 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
400 label = var_to_string (v);
401 tab_text (t, 2, i, TAB_CENTER, label);
402 tab_text (t, i + 2, 0, TAB_CENTER, label);
403 for (k = 1; k < c->n_coeffs; k++)
405 col = (i <= k) ? k : i;
406 row = (i <= k) ? i : k;
407 tab_float (t, k + 2, i, TAB_CENTER,
408 gsl_matrix_get (c->cov, row, col), 8, 3);
411 tab_title (t, _("Coefficient Correlations"));
415 reg_stats_ses (pspp_linreg_cache * c)
420 reg_stats_xtx (pspp_linreg_cache * c)
425 reg_stats_collin (pspp_linreg_cache * c)
430 reg_stats_tol (pspp_linreg_cache * c)
435 reg_stats_selection (pspp_linreg_cache * c)
441 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
442 int keyword, pspp_linreg_cache * c)
451 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
454 The order here must match the order in which the STATISTICS
455 keywords appear in the specification section above.
482 Set everything but F.
484 for (i = 0; i < f; i++)
491 for (i = 0; i < all; i++)
499 Default output: ANOVA table, parameter estimates,
500 and statistics for variables not entered into model,
503 if (keywords[defaults] | d)
511 statistics_keyword_output (reg_stats_r, keywords[r], c);
512 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
513 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
514 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
515 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
516 statistics_keyword_output (reg_stats_label, keywords[label], c);
517 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
518 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
519 statistics_keyword_output (reg_stats_f, keywords[f], c);
520 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
521 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
522 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
523 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
524 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
525 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
529 Free the transformation. Free its linear model if this
530 transformation is the last one.
533 regression_trns_free (void *t_)
536 struct reg_trns *t = t_;
538 if (t->trns_id == t->n_trns)
540 result = pspp_linreg_cache_free (t->c);
548 Gets the predicted values.
551 regression_trns_pred_proc (void *t_, struct ccase *c,
552 casenumber case_idx UNUSED)
556 struct reg_trns *trns = t_;
557 pspp_linreg_cache *model;
558 union value *output = NULL;
559 const union value **vals = NULL;
560 const struct variable **vars = NULL;
562 assert (trns != NULL);
564 assert (model != NULL);
565 assert (model->depvar != NULL);
566 assert (model->pred != NULL);
568 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
569 n_vals = (*model->get_vars) (model, vars);
571 vals = xnmalloc (n_vals, sizeof (*vals));
572 output = case_data_rw (c, model->pred);
573 assert (output != NULL);
575 for (i = 0; i < n_vals; i++)
577 vals[i] = case_data (c, vars[i]);
579 output->f = (*model->predict) ((const struct variable **) vars,
580 vals, model, n_vals);
583 return TRNS_CONTINUE;
590 regression_trns_resid_proc (void *t_, struct ccase *c,
591 casenumber case_idx UNUSED)
595 struct reg_trns *trns = t_;
596 pspp_linreg_cache *model;
597 union value *output = NULL;
598 const union value **vals = NULL;
599 const union value *obs = NULL;
600 const struct variable **vars = NULL;
602 assert (trns != NULL);
604 assert (model != NULL);
605 assert (model->depvar != NULL);
606 assert (model->resid != NULL);
608 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
609 n_vals = (*model->get_vars) (model, vars);
611 vals = xnmalloc (n_vals, sizeof (*vals));
612 output = case_data_rw (c, model->resid);
613 assert (output != NULL);
615 for (i = 0; i < n_vals; i++)
617 vals[i] = case_data (c, vars[i]);
619 obs = case_data (c, model->depvar);
620 output->f = (*model->residual) ((const struct variable **) vars,
621 vals, obs, model, n_vals);
624 return TRNS_CONTINUE;
628 Returns false if NAME is a duplicate of any existing variable name.
631 try_name (const struct dictionary *dict, const char *name)
633 if (dict_lookup_var (dict, name) != NULL)
640 reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN],
641 const char prefix[VAR_NAME_LEN])
645 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
646 while (!try_name (dict, name))
649 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
654 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
655 pspp_linreg_cache * c, struct variable **v, int n_trns)
657 struct dictionary *dict = dataset_dict (ds);
658 static int trns_index = 1;
659 char name[VAR_NAME_LEN];
660 struct variable *new_var;
661 struct reg_trns *t = NULL;
663 t = xmalloc (sizeof (*t));
664 t->trns_id = trns_index;
667 reg_get_name (dict, name, prefix);
668 new_var = dict_create_var (dict, name, 0);
669 assert (new_var != NULL);
671 add_transformation (ds, f, regression_trns_free, t);
676 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
678 pspp_linreg_cache **lc;
682 assert (models != NULL);
686 /* Count the number of transformations we will need. */
687 for (i = 0; i < REGRESSION_SV_count; i++)
694 n_trns *= cmd.n_dependent;
696 for (lc = models; lc < models + cmd.n_dependent; lc++)
698 assert (*lc != NULL);
699 assert ((*lc)->depvar != NULL);
700 if (cmd.a_save[REGRESSION_SV_RESID])
702 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
703 &(*lc)->resid, n_trns);
705 if (cmd.a_save[REGRESSION_SV_PRED])
707 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
708 &(*lc)->pred, n_trns);
714 for (lc = models; lc < models + cmd.n_dependent; lc++)
718 pspp_linreg_cache_free (*lc);
725 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
729 for (i = 0; i < n_vars; i++)
740 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
744 struct variable **varlist;
746 fprintf (fp, "%s", reg_export_categorical_encode_1);
748 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
749 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
751 struct pspp_coeff *coeff = c->coeff[i];
752 const struct variable *v = pspp_coeff_get_var (coeff, 0);
753 if (var_is_alpha (v))
755 if (!reg_inserted (v, varlist, n_vars))
757 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
759 varlist[n_vars] = (struct variable *) v;
764 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
765 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
767 for (i = 0; i < n_vars - 1; i++)
769 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
771 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
773 for (i = 0; i < n_vars; i++)
775 int n_categories = cat_get_n_categories (varlist[i]);
778 fprintf (fp, "%s.name = \"%s\";\n\t",
779 var_get_name (varlist[i]), var_get_name (varlist[i]));
780 fprintf (fp, "%s.n_vals = %d;\n\t",
781 var_get_name (varlist[i]), n_categories);
783 for (j = 0; j < n_categories; j++)
785 const union value *val = cat_subscript_to_value (j, varlist[i]);
786 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
787 var_get_name (varlist[i]), j,
788 var_get_value_name (varlist[i], val));
791 fprintf (fp, "%s", reg_export_categorical_encode_2);
795 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
798 struct pspp_coeff *coeff;
799 const struct variable *v;
801 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
802 for (i = 1; i < c->n_indeps; i++)
805 v = pspp_coeff_get_var (coeff, 0);
806 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
809 v = pspp_coeff_get_var (coeff, 0);
810 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
813 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
815 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
816 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
817 reg_print_depvars (fp, c);
818 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
820 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
821 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
824 reg_has_categorical (pspp_linreg_cache * c)
827 const struct variable *v;
829 for (i = 1; i < c->n_coeffs; i++)
831 v = pspp_coeff_get_var (c->coeff[i], 0);
832 if (var_is_alpha (v))
839 subcommand_export (int export, pspp_linreg_cache * c)
844 int n_quantiles = 100;
846 struct pspp_coeff *coeff;
851 assert (model_file != NULL);
852 fp = fopen (fh_get_file_name (model_file), "w");
854 fprintf (fp, "%s", reg_preamble);
855 reg_print_getvar (fp, c);
856 if (reg_has_categorical (c))
858 reg_print_categorical_encoding (fp, c);
860 fprintf (fp, "%s", reg_export_t_quantiles_1);
861 for (i = 0; i < n_quantiles - 1; i++)
863 tmp = 0.5 + 0.005 * (double) i;
864 fprintf (fp, "%.15e,\n\t\t",
865 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
867 fprintf (fp, "%.15e};\n\t",
868 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
869 fprintf (fp, "%s", reg_export_t_quantiles_2);
870 fprintf (fp, "%s", reg_mean_cmt);
871 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
872 fprintf (fp, "const char *var_names[])\n{\n\t");
873 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
874 for (i = 1; i < c->n_indeps; i++)
877 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
880 fprintf (fp, "%.15e};\n\t", coeff->estimate);
882 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
883 fprintf (fp, "int i;\n\tint j;\n\n\t");
884 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
885 fprintf (fp, "%s", reg_getvar);
886 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
888 for (i = 0; i < c->cov->size1 - 1; i++)
891 for (j = 0; j < c->cov->size2 - 1; j++)
893 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
895 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
898 for (j = 0; j < c->cov->size2 - 1; j++)
900 fprintf (fp, "%.15e, ",
901 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
903 fprintf (fp, "%.15e}\n\t",
904 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
905 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
907 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
908 fprintf (fp, "%s", reg_variance);
909 fprintf (fp, "%s", reg_export_confidence_interval);
910 tmp = c->mse * c->mse;
911 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
912 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
913 fprintf (fp, "%s", reg_export_prediction_interval_3);
915 fp = fopen ("pspp_model_reg.h", "w");
916 fprintf (fp, "%s", reg_header);
922 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
923 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
925 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
926 if (!lex_force_match (lexer, '('))
929 if (lex_match (lexer, '*'))
933 fh_unref (model_file);
934 model_file = fh_parse (lexer, FH_REF_FILE);
935 if (model_file == NULL)
939 if (!lex_force_match (lexer, ')'))
946 cmd_regression (struct lexer *lexer, struct dataset *ds)
948 struct casegrouper *grouper;
949 struct casereader *group;
950 pspp_linreg_cache **models;
955 if (!parse_regression (lexer, ds, &cmd, NULL))
957 fh_unref (model_file);
961 models = xnmalloc (cmd.n_dependent, sizeof *models);
962 for (i = 0; i < cmd.n_dependent; i++)
968 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
969 while (casegrouper_get_next_group (grouper, &group))
970 run_regression (group, &cmd, ds, models);
971 ok = casegrouper_destroy (grouper);
972 ok = proc_commit (ds) && ok;
974 subcommand_save (ds, cmd.sbc_save, models);
977 free_regression (&cmd);
978 fh_unref (model_file);
980 return ok ? CMD_SUCCESS : CMD_FAILURE;
984 Is variable k the dependent variable?
987 is_depvar (size_t k, const struct variable *v)
989 return v == v_variables[k];
992 /* Parser for the variables sub command */
994 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
995 struct cmd_regression *cmd UNUSED,
998 const struct dictionary *dict = dataset_dict (ds);
1000 lex_match (lexer, '=');
1002 if ((lex_token (lexer) != T_ID
1003 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1004 && lex_token (lexer) != T_ALL)
1008 if (!parse_variables_const
1009 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1014 assert (n_variables);
1019 /* Identify the explanatory variables in v_variables. Returns
1020 the number of independent variables. */
1022 identify_indep_vars (const struct variable **indep_vars,
1023 const struct variable *depvar)
1025 int n_indep_vars = 0;
1028 for (i = 0; i < n_variables; i++)
1029 if (!is_depvar (i, depvar))
1030 indep_vars[n_indep_vars++] = v_variables[i];
1031 if ((n_indep_vars < 2) && is_depvar (0, depvar))
1034 There is only one independent variable, and it is the same
1035 as the dependent variable. Print a warning and continue.
1038 gettext ("The dependent variable is equal to the independent variable."
1039 "The least squares line is therefore Y=X."
1040 "Standard errors and related statistics may be meaningless."));
1042 indep_vars[0] = v_variables[0];
1044 return n_indep_vars;
1047 /* Encode categorical variables.
1048 Returns number of valid cases. */
1050 prepare_categories (struct casereader *input,
1051 const struct variable **vars, size_t n_vars,
1052 struct moments_var *mom)
1058 assert (vars != NULL);
1059 assert (mom != NULL);
1061 for (i = 0; i < n_vars; i++)
1062 if (var_is_alpha (vars[i]))
1063 cat_stored_values_create (vars[i]);
1066 for (; casereader_read (input, &c); case_destroy (&c))
1069 The second condition ensures the program will run even if
1070 there is only one variable to act as both explanatory and
1073 for (i = 0; i < n_vars; i++)
1075 const union value *val = case_data (&c, vars[i]);
1076 if (var_is_alpha (vars[i]))
1077 cat_value_update (vars[i], val);
1079 moments1_add (mom[i].m, val->f, 1.0);
1083 casereader_destroy (input);
1089 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1091 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1092 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1093 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1094 pspp_coeff_init (c->coeff + 1, dm);
1098 Put the moments in the linreg cache.
1101 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1102 struct design_matrix *dm, size_t n)
1112 Scan the variable names in the columns of the design matrix.
1113 When we find the variable we need, insert its mean in the cache.
1115 for (i = 0; i < dm->m->size2; i++)
1117 for (j = 0; j < n; j++)
1119 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1121 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1122 &skewness, &kurtosis);
1123 gsl_vector_set (c->indep_means, i, mean);
1124 gsl_vector_set (c->indep_std, i, sqrt (variance));
1131 run_regression (struct casereader *input, struct cmd_regression *cmd,
1132 struct dataset *ds, pspp_linreg_cache **models)
1138 const struct variable **indep_vars;
1139 struct design_matrix *X;
1140 struct moments_var *mom;
1143 pspp_linreg_opts lopts;
1145 assert (models != NULL);
1147 if (!casereader_peek (input, 0, &c))
1149 casereader_destroy (input);
1152 output_split_file_values (ds, &c);
1157 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0);
1160 for (i = 0; i < cmd->n_dependent; i++)
1162 if (!var_is_numeric (cmd->v_dependent[i]))
1164 msg (SE, _("Dependent variable must be numeric."));
1169 mom = xnmalloc (n_variables, sizeof (*mom));
1170 for (i = 0; i < n_variables; i++)
1172 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1173 (mom + i)->v = v_variables[i];
1175 lopts.get_depvar_mean_std = 1;
1177 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1178 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1180 for (k = 0; k < cmd->n_dependent; k++)
1182 const struct variable *dep_var;
1183 struct casereader *reader;
1186 size_t n_data; /* Number of valid cases. */
1188 dep_var = cmd->v_dependent[k];
1189 n_indep = identify_indep_vars (indep_vars, dep_var);
1190 reader = casereader_clone (input);
1191 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1193 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1195 n_data = prepare_categories (casereader_clone (reader),
1196 indep_vars, n_indep, mom);
1198 if ((n_data > 0) && (n_indep > 0))
1200 Y = gsl_vector_alloc (n_data);
1202 design_matrix_create (n_indep,
1203 (const struct variable **) indep_vars,
1205 for (i = 0; i < X->m->size2; i++)
1207 lopts.get_indep_mean_std[i] = 1;
1209 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1210 models[k]->depvar = dep_var;
1212 For large data sets, use QR decomposition.
1214 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1216 models[k]->method = PSPP_LINREG_QR;
1220 The second pass fills the design matrix.
1222 reader = casereader_create_counter (reader, &row, -1);
1223 for (; casereader_read (reader, &c); case_destroy (&c))
1225 for (i = 0; i < n_indep; ++i)
1227 const struct variable *v = indep_vars[i];
1228 const union value *val = case_data (&c, v);
1229 if (var_is_alpha (v))
1230 design_matrix_set_categorical (X, row, v, val);
1232 design_matrix_set_numeric (X, row, v, val);
1234 gsl_vector_set (Y, row, case_num (&c, dep_var));
1237 Now that we know the number of coefficients, allocate space
1238 and store pointers to the variables that correspond to the
1241 coeff_init (models[k], X);
1244 Find the least-squares estimates and other statistics.
1246 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1247 compute_moments (models[k], mom, X, n_variables);
1249 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1251 subcommand_statistics (cmd->a_statistics, models[k]);
1252 subcommand_export (cmd->sbc_export, models[k]);
1255 gsl_vector_free (Y);
1256 design_matrix_destroy (X);
1261 gettext ("No valid data found. This command was skipped."));
1263 casereader_destroy (reader);
1265 for (i = 0; i < n_variables; i++)
1267 moments1_destroy ((mom + i)->m);
1271 free (lopts.get_indep_mean_std);
1272 casereader_destroy (input);