1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include "regression-export.h"
26 #include <data/case.h>
27 #include <data/casegrouper.h>
28 #include <data/casereader.h>
29 #include <data/category.h>
30 #include <data/dictionary.h>
31 #include <data/missing-values.h>
32 #include <data/procedure.h>
33 #include <data/transformations.h>
34 #include <data/value-labels.h>
35 #include <data/variable.h>
36 #include <language/command.h>
37 #include <language/dictionary/split-file.h>
38 #include <language/data-io/file-handle.h>
39 #include <language/lexer/lexer.h>
40 #include <libpspp/compiler.h>
41 #include <libpspp/message.h>
42 #include <libpspp/taint.h>
43 #include <math/design-matrix.h>
44 #include <math/coefficient.h>
45 #include <math/linreg/linreg.h>
46 #include <math/moments.h>
47 #include <output/table.h>
52 #define _(msgid) gettext (msgid)
54 #define REG_LARGE_DATA 1000
59 "REGRESSION" (regression_):
80 +save[sv_]=resid,pred;
85 static struct cmd_regression cmd;
88 Moments for each of the variables used.
93 const struct variable *v;
97 Transformations for saving predicted values
102 int n_trns; /* Number of transformations. */
103 int trns_id; /* Which trns is this one? */
104 pspp_linreg_cache *c; /* Linear model for this trns. */
107 Variables used (both explanatory and response).
109 static const struct variable **v_variables;
114 static size_t n_variables;
117 File where the model will be saved if the EXPORT subcommand
120 static struct file_handle *model_file;
122 static bool run_regression (struct casereader *, struct cmd_regression *,
123 struct dataset *, pspp_linreg_cache **);
126 STATISTICS subcommand output functions.
128 static void reg_stats_r (pspp_linreg_cache *);
129 static void reg_stats_coeff (pspp_linreg_cache *);
130 static void reg_stats_anova (pspp_linreg_cache *);
131 static void reg_stats_outs (pspp_linreg_cache *);
132 static void reg_stats_zpp (pspp_linreg_cache *);
133 static void reg_stats_label (pspp_linreg_cache *);
134 static void reg_stats_sha (pspp_linreg_cache *);
135 static void reg_stats_ci (pspp_linreg_cache *);
136 static void reg_stats_f (pspp_linreg_cache *);
137 static void reg_stats_bcov (pspp_linreg_cache *);
138 static void reg_stats_ses (pspp_linreg_cache *);
139 static void reg_stats_xtx (pspp_linreg_cache *);
140 static void reg_stats_collin (pspp_linreg_cache *);
141 static void reg_stats_tol (pspp_linreg_cache *);
142 static void reg_stats_selection (pspp_linreg_cache *);
143 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
144 int, pspp_linreg_cache *);
147 reg_stats_r (pspp_linreg_cache * c)
157 rsq = c->ssm / c->sst;
158 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
159 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
160 t = tab_create (n_cols, n_rows, 0);
161 tab_dim (t, tab_natural_dimensions);
162 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
163 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
164 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
165 tab_vline (t, TAL_0, 1, 0, 0);
167 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
168 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
169 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
170 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
171 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
172 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
173 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
174 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
175 tab_title (t, _("Model Summary"));
180 Table showing estimated regression coefficients.
183 reg_stats_coeff (pspp_linreg_cache * c)
195 const struct variable *v;
196 const union value *val;
200 n_rows = c->n_coeffs + 2;
202 t = tab_create (n_cols, n_rows, 0);
203 tab_headers (t, 2, 0, 1, 0);
204 tab_dim (t, tab_natural_dimensions);
205 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
206 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
207 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
208 tab_vline (t, TAL_0, 1, 0, 0);
210 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
211 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
212 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
213 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
214 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
215 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
216 coeff = c->coeff[0]->estimate;
217 tab_float (t, 2, 1, 0, coeff, 10, 2);
218 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
219 tab_float (t, 3, 1, 0, std_err, 10, 2);
220 beta = coeff / c->depvar_std;
221 tab_float (t, 4, 1, 0, beta, 10, 2);
222 t_stat = coeff / std_err;
223 tab_float (t, 5, 1, 0, t_stat, 10, 2);
224 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
225 tab_float (t, 6, 1, 0, pval, 10, 2);
226 for (j = 1; j <= c->n_indeps; j++)
229 ds_init_empty (&tstr);
231 v = pspp_coeff_get_var (c->coeff[j], 0);
232 label = var_to_string (v);
233 /* Do not overwrite the variable's name. */
234 ds_put_cstr (&tstr, label);
235 if (var_is_alpha (v))
238 Append the value associated with this coefficient.
239 This makes sense only if we us the usual binary encoding
243 val = pspp_coeff_get_value (c->coeff[j], v);
245 var_append_value_name (v, val, &tstr);
248 tab_text (t, 1, j + 1, TAB_CENTER, ds_cstr (&tstr));
250 Regression coefficients.
252 coeff = c->coeff[j]->estimate;
253 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
255 Standard error of the coefficients.
257 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
258 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
260 'Standardized' coefficient, i.e., regression coefficient
261 if all variables had unit variance.
263 beta = gsl_vector_get (c->indep_std, j);
264 beta *= coeff / c->depvar_std;
265 tab_float (t, 4, j + 1, 0, beta, 10, 2);
268 Test statistic for H0: coefficient is 0.
270 t_stat = coeff / std_err;
271 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
273 P values for the test statistic above.
276 2 * gsl_cdf_tdist_Q (fabs (t_stat),
277 (double) (c->n_obs - c->n_coeffs));
278 tab_float (t, 6, j + 1, 0, pval, 10, 2);
281 tab_title (t, _("Coefficients"));
286 Display the ANOVA table.
289 reg_stats_anova (pspp_linreg_cache * c)
293 const double msm = c->ssm / c->dfm;
294 const double mse = c->sse / c->dfe;
295 const double F = msm / mse;
296 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
301 t = tab_create (n_cols, n_rows, 0);
302 tab_headers (t, 2, 0, 1, 0);
303 tab_dim (t, tab_natural_dimensions);
305 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
307 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
308 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
309 tab_vline (t, TAL_0, 1, 0, 0);
311 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
312 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
313 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
314 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
315 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
317 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
318 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
319 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
321 /* Sums of Squares */
322 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
323 tab_float (t, 2, 3, 0, c->sst, 10, 2);
324 tab_float (t, 2, 2, 0, c->sse, 10, 2);
327 /* Degrees of freedom */
328 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
329 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
330 tab_float (t, 3, 3, 0, c->dft, 4, 0);
334 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
335 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
337 tab_float (t, 5, 1, 0, F, 8, 3);
339 tab_float (t, 6, 1, 0, pval, 8, 3);
341 tab_title (t, _("ANOVA"));
345 reg_stats_outs (pspp_linreg_cache * c)
350 reg_stats_zpp (pspp_linreg_cache * c)
355 reg_stats_label (pspp_linreg_cache * c)
360 reg_stats_sha (pspp_linreg_cache * c)
365 reg_stats_ci (pspp_linreg_cache * c)
370 reg_stats_f (pspp_linreg_cache * c)
375 reg_stats_bcov (pspp_linreg_cache * c)
387 n_cols = c->n_indeps + 1 + 2;
388 n_rows = 2 * (c->n_indeps + 1);
389 t = tab_create (n_cols, n_rows, 0);
390 tab_headers (t, 2, 0, 1, 0);
391 tab_dim (t, tab_natural_dimensions);
392 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
393 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
394 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
395 tab_vline (t, TAL_0, 1, 0, 0);
396 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
397 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
398 for (i = 1; i < c->n_coeffs; i++)
400 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
401 label = var_to_string (v);
402 tab_text (t, 2, i, TAB_CENTER, label);
403 tab_text (t, i + 2, 0, TAB_CENTER, label);
404 for (k = 1; k < c->n_coeffs; k++)
406 col = (i <= k) ? k : i;
407 row = (i <= k) ? i : k;
408 tab_float (t, k + 2, i, TAB_CENTER,
409 gsl_matrix_get (c->cov, row, col), 8, 3);
412 tab_title (t, _("Coefficient Correlations"));
416 reg_stats_ses (pspp_linreg_cache * c)
421 reg_stats_xtx (pspp_linreg_cache * c)
426 reg_stats_collin (pspp_linreg_cache * c)
431 reg_stats_tol (pspp_linreg_cache * c)
436 reg_stats_selection (pspp_linreg_cache * c)
442 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
443 int keyword, pspp_linreg_cache * c)
452 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
455 The order here must match the order in which the STATISTICS
456 keywords appear in the specification section above.
483 Set everything but F.
485 for (i = 0; i < f; i++)
492 for (i = 0; i < all; i++)
500 Default output: ANOVA table, parameter estimates,
501 and statistics for variables not entered into model,
504 if (keywords[defaults] | d)
512 statistics_keyword_output (reg_stats_r, keywords[r], c);
513 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
514 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
515 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
516 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
517 statistics_keyword_output (reg_stats_label, keywords[label], c);
518 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
519 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
520 statistics_keyword_output (reg_stats_f, keywords[f], c);
521 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
522 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
523 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
524 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
525 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
526 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
530 Free the transformation. Free its linear model if this
531 transformation is the last one.
534 regression_trns_free (void *t_)
537 struct reg_trns *t = t_;
539 if (t->trns_id == t->n_trns)
541 result = pspp_linreg_cache_free (t->c);
549 Gets the predicted values.
552 regression_trns_pred_proc (void *t_, struct ccase *c,
553 casenumber case_idx UNUSED)
557 struct reg_trns *trns = t_;
558 pspp_linreg_cache *model;
559 union value *output = NULL;
560 const union value **vals = NULL;
561 const struct variable **vars = NULL;
563 assert (trns != NULL);
565 assert (model != NULL);
566 assert (model->depvar != NULL);
567 assert (model->pred != NULL);
569 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
570 n_vals = (*model->get_vars) (model, vars);
572 vals = xnmalloc (n_vals, sizeof (*vals));
573 output = case_data_rw (c, model->pred);
574 assert (output != NULL);
576 for (i = 0; i < n_vals; i++)
578 vals[i] = case_data (c, vars[i]);
580 output->f = (*model->predict) ((const struct variable **) vars,
581 vals, model, n_vals);
584 return TRNS_CONTINUE;
591 regression_trns_resid_proc (void *t_, struct ccase *c,
592 casenumber case_idx UNUSED)
596 struct reg_trns *trns = t_;
597 pspp_linreg_cache *model;
598 union value *output = NULL;
599 const union value **vals = NULL;
600 const union value *obs = NULL;
601 const struct variable **vars = NULL;
603 assert (trns != NULL);
605 assert (model != NULL);
606 assert (model->depvar != NULL);
607 assert (model->resid != NULL);
609 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
610 n_vals = (*model->get_vars) (model, vars);
612 vals = xnmalloc (n_vals, sizeof (*vals));
613 output = case_data_rw (c, model->resid);
614 assert (output != NULL);
616 for (i = 0; i < n_vals; i++)
618 vals[i] = case_data (c, vars[i]);
620 obs = case_data (c, model->depvar);
621 output->f = (*model->residual) ((const struct variable **) vars,
622 vals, obs, model, n_vals);
625 return TRNS_CONTINUE;
629 Returns false if NAME is a duplicate of any existing variable name.
632 try_name (const struct dictionary *dict, const char *name)
634 if (dict_lookup_var (dict, name) != NULL)
641 reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN],
642 const char prefix[VAR_NAME_LEN])
646 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
647 while (!try_name (dict, name))
650 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
655 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
656 pspp_linreg_cache * c, struct variable **v, int n_trns)
658 struct dictionary *dict = dataset_dict (ds);
659 static int trns_index = 1;
660 char name[VAR_NAME_LEN];
661 struct variable *new_var;
662 struct reg_trns *t = NULL;
664 t = xmalloc (sizeof (*t));
665 t->trns_id = trns_index;
668 reg_get_name (dict, name, prefix);
669 new_var = dict_create_var (dict, name, 0);
670 assert (new_var != NULL);
672 add_transformation (ds, f, regression_trns_free, t);
677 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
679 pspp_linreg_cache **lc;
683 assert (models != NULL);
687 /* Count the number of transformations we will need. */
688 for (i = 0; i < REGRESSION_SV_count; i++)
695 n_trns *= cmd.n_dependent;
697 for (lc = models; lc < models + cmd.n_dependent; lc++)
699 assert (*lc != NULL);
700 assert ((*lc)->depvar != NULL);
701 if (cmd.a_save[REGRESSION_SV_RESID])
703 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
704 &(*lc)->resid, n_trns);
706 if (cmd.a_save[REGRESSION_SV_PRED])
708 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
709 &(*lc)->pred, n_trns);
715 for (lc = models; lc < models + cmd.n_dependent; lc++)
719 pspp_linreg_cache_free (*lc);
726 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
730 for (i = 0; i < n_vars; i++)
741 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
745 struct variable **varlist;
747 fprintf (fp, "%s", reg_export_categorical_encode_1);
749 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
750 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
752 struct pspp_coeff *coeff = c->coeff[i];
753 const struct variable *v = pspp_coeff_get_var (coeff, 0);
754 if (var_is_alpha (v))
756 if (!reg_inserted (v, varlist, n_vars))
758 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
760 varlist[n_vars] = (struct variable *) v;
765 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
766 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
768 for (i = 0; i < n_vars - 1; i++)
770 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
772 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
774 for (i = 0; i < n_vars; i++)
776 int n_categories = cat_get_n_categories (varlist[i]);
779 fprintf (fp, "%s.name = \"%s\";\n\t",
780 var_get_name (varlist[i]), var_get_name (varlist[i]));
781 fprintf (fp, "%s.n_vals = %d;\n\t",
782 var_get_name (varlist[i]), n_categories);
784 for (j = 0; j < n_categories; j++)
787 const union value *val = cat_subscript_to_value (j, varlist[i]);
788 ds_init_empty (&vstr);
789 var_append_value_name (varlist[i], val, &vstr);
790 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
791 var_get_name (varlist[i]), j,
797 fprintf (fp, "%s", reg_export_categorical_encode_2);
801 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
804 struct pspp_coeff *coeff;
805 const struct variable *v;
807 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
808 for (i = 1; i < c->n_indeps; i++)
811 v = pspp_coeff_get_var (coeff, 0);
812 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
815 v = pspp_coeff_get_var (coeff, 0);
816 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
819 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
821 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
822 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
823 reg_print_depvars (fp, c);
824 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
826 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
827 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
830 reg_has_categorical (pspp_linreg_cache * c)
833 const struct variable *v;
835 for (i = 1; i < c->n_coeffs; i++)
837 v = pspp_coeff_get_var (c->coeff[i], 0);
838 if (var_is_alpha (v))
845 subcommand_export (int export, pspp_linreg_cache * c)
850 int n_quantiles = 100;
852 struct pspp_coeff *coeff;
857 assert (model_file != NULL);
858 fp = fopen (fh_get_file_name (model_file), "w");
860 fprintf (fp, "%s", reg_preamble);
861 reg_print_getvar (fp, c);
862 if (reg_has_categorical (c))
864 reg_print_categorical_encoding (fp, c);
866 fprintf (fp, "%s", reg_export_t_quantiles_1);
867 for (i = 0; i < n_quantiles - 1; i++)
869 tmp = 0.5 + 0.005 * (double) i;
870 fprintf (fp, "%.15e,\n\t\t",
871 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
873 fprintf (fp, "%.15e};\n\t",
874 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
875 fprintf (fp, "%s", reg_export_t_quantiles_2);
876 fprintf (fp, "%s", reg_mean_cmt);
877 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
878 fprintf (fp, "const char *var_names[])\n{\n\t");
879 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
880 for (i = 1; i < c->n_indeps; i++)
883 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
886 fprintf (fp, "%.15e};\n\t", coeff->estimate);
888 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
889 fprintf (fp, "int i;\n\tint j;\n\n\t");
890 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
891 fprintf (fp, "%s", reg_getvar);
892 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
894 for (i = 0; i < c->cov->size1 - 1; i++)
897 for (j = 0; j < c->cov->size2 - 1; j++)
899 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
901 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
904 for (j = 0; j < c->cov->size2 - 1; j++)
906 fprintf (fp, "%.15e, ",
907 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
909 fprintf (fp, "%.15e}\n\t",
910 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
911 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
913 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
914 fprintf (fp, "%s", reg_variance);
915 fprintf (fp, "%s", reg_export_confidence_interval);
916 tmp = c->mse * c->mse;
917 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
918 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
919 fprintf (fp, "%s", reg_export_prediction_interval_3);
921 fp = fopen ("pspp_model_reg.h", "w");
922 fprintf (fp, "%s", reg_header);
928 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED,
929 struct cmd_regression *cmd UNUSED, void *aux UNUSED)
931 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
932 if (!lex_force_match (lexer, '('))
935 if (lex_match (lexer, '*'))
939 fh_unref (model_file);
940 model_file = fh_parse (lexer, FH_REF_FILE);
941 if (model_file == NULL)
945 if (!lex_force_match (lexer, ')'))
952 cmd_regression (struct lexer *lexer, struct dataset *ds)
954 struct casegrouper *grouper;
955 struct casereader *group;
956 pspp_linreg_cache **models;
961 if (!parse_regression (lexer, ds, &cmd, NULL))
963 fh_unref (model_file);
967 models = xnmalloc (cmd.n_dependent, sizeof *models);
968 for (i = 0; i < cmd.n_dependent; i++)
974 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
975 while (casegrouper_get_next_group (grouper, &group))
976 run_regression (group, &cmd, ds, models);
977 ok = casegrouper_destroy (grouper);
978 ok = proc_commit (ds) && ok;
980 subcommand_save (ds, cmd.sbc_save, models);
983 free_regression (&cmd);
984 fh_unref (model_file);
986 return ok ? CMD_SUCCESS : CMD_FAILURE;
990 Is variable k the dependent variable?
993 is_depvar (size_t k, const struct variable *v)
995 return v == v_variables[k];
998 /* Parser for the variables sub command */
1000 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
1001 struct cmd_regression *cmd UNUSED,
1004 const struct dictionary *dict = dataset_dict (ds);
1006 lex_match (lexer, '=');
1008 if ((lex_token (lexer) != T_ID
1009 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1010 && lex_token (lexer) != T_ALL)
1014 if (!parse_variables_const
1015 (lexer, dict, &v_variables, &n_variables, PV_NONE))
1020 assert (n_variables);
1025 /* Identify the explanatory variables in v_variables. Returns
1026 the number of independent variables. */
1028 identify_indep_vars (const struct variable **indep_vars,
1029 const struct variable *depvar)
1031 int n_indep_vars = 0;
1034 for (i = 0; i < n_variables; i++)
1035 if (!is_depvar (i, depvar))
1036 indep_vars[n_indep_vars++] = v_variables[i];
1037 if ((n_indep_vars < 2) && is_depvar (0, depvar))
1040 There is only one independent variable, and it is the same
1041 as the dependent variable. Print a warning and continue.
1044 gettext ("The dependent variable is equal to the independent variable."
1045 "The least squares line is therefore Y=X."
1046 "Standard errors and related statistics may be meaningless."));
1048 indep_vars[0] = v_variables[0];
1050 return n_indep_vars;
1053 /* Encode categorical variables.
1054 Returns number of valid cases. */
1056 prepare_categories (struct casereader *input,
1057 const struct variable **vars, size_t n_vars,
1058 struct moments_var *mom)
1064 assert (vars != NULL);
1065 assert (mom != NULL);
1067 for (i = 0; i < n_vars; i++)
1068 if (var_is_alpha (vars[i]))
1069 cat_stored_values_create (vars[i]);
1072 for (; casereader_read (input, &c); case_destroy (&c))
1075 The second condition ensures the program will run even if
1076 there is only one variable to act as both explanatory and
1079 for (i = 0; i < n_vars; i++)
1081 const union value *val = case_data (&c, vars[i]);
1082 if (var_is_alpha (vars[i]))
1083 cat_value_update (vars[i], val);
1085 moments1_add (mom[i].m, val->f, 1.0);
1089 casereader_destroy (input);
1095 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1097 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1098 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1099 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1100 pspp_coeff_init (c->coeff + 1, dm);
1104 Put the moments in the linreg cache.
1107 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
1108 struct design_matrix *dm, size_t n)
1118 Scan the variable names in the columns of the design matrix.
1119 When we find the variable we need, insert its mean in the cache.
1121 for (i = 0; i < dm->m->size2; i++)
1123 for (j = 0; j < n; j++)
1125 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
1127 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
1128 &skewness, &kurtosis);
1129 gsl_vector_set (c->indep_means, i, mean);
1130 gsl_vector_set (c->indep_std, i, sqrt (variance));
1137 run_regression (struct casereader *input, struct cmd_regression *cmd,
1138 struct dataset *ds, pspp_linreg_cache **models)
1144 const struct variable **indep_vars;
1145 struct design_matrix *X;
1146 struct moments_var *mom;
1149 pspp_linreg_opts lopts;
1151 assert (models != NULL);
1153 if (!casereader_peek (input, 0, &c))
1155 casereader_destroy (input);
1158 output_split_file_values (ds, &c);
1163 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0);
1166 for (i = 0; i < cmd->n_dependent; i++)
1168 if (!var_is_numeric (cmd->v_dependent[i]))
1170 msg (SE, _("Dependent variable must be numeric."));
1175 mom = xnmalloc (n_variables, sizeof (*mom));
1176 for (i = 0; i < n_variables; i++)
1178 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
1179 (mom + i)->v = v_variables[i];
1181 lopts.get_depvar_mean_std = 1;
1183 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
1184 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
1186 for (k = 0; k < cmd->n_dependent; k++)
1188 const struct variable *dep_var;
1189 struct casereader *reader;
1192 size_t n_data; /* Number of valid cases. */
1194 dep_var = cmd->v_dependent[k];
1195 n_indep = identify_indep_vars (indep_vars, dep_var);
1196 reader = casereader_clone (input);
1197 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
1199 reader = casereader_create_filter_missing (reader, &dep_var, 1,
1201 n_data = prepare_categories (casereader_clone (reader),
1202 indep_vars, n_indep, mom);
1204 if ((n_data > 0) && (n_indep > 0))
1206 Y = gsl_vector_alloc (n_data);
1208 design_matrix_create (n_indep,
1209 (const struct variable **) indep_vars,
1211 for (i = 0; i < X->m->size2; i++)
1213 lopts.get_indep_mean_std[i] = 1;
1215 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1216 models[k]->depvar = dep_var;
1218 For large data sets, use QR decomposition.
1220 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1222 models[k]->method = PSPP_LINREG_QR;
1226 The second pass fills the design matrix.
1228 reader = casereader_create_counter (reader, &row, -1);
1229 for (; casereader_read (reader, &c); case_destroy (&c))
1231 for (i = 0; i < n_indep; ++i)
1233 const struct variable *v = indep_vars[i];
1234 const union value *val = case_data (&c, v);
1235 if (var_is_alpha (v))
1236 design_matrix_set_categorical (X, row, v, val);
1238 design_matrix_set_numeric (X, row, v, val);
1240 gsl_vector_set (Y, row, case_num (&c, dep_var));
1243 Now that we know the number of coefficients, allocate space
1244 and store pointers to the variables that correspond to the
1247 coeff_init (models[k], X);
1250 Find the least-squares estimates and other statistics.
1252 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1253 compute_moments (models[k], mom, X, n_variables);
1255 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1257 subcommand_statistics (cmd->a_statistics, models[k]);
1258 subcommand_export (cmd->sbc_export, models[k]);
1261 gsl_vector_free (Y);
1262 design_matrix_destroy (X);
1267 gettext ("No valid data found. This command was skipped."));
1269 casereader_destroy (reader);
1271 for (i = 0; i < n_variables; i++)
1273 moments1_destroy ((mom + i)->m);
1277 free (lopts.get_indep_mean_std);
1278 casereader_destroy (input);