1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <gsl/gsl_vector.h>
24 #include <gsl/gsl_matrix.h>
26 #include <libpspp/alloc.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/category.h>
30 #include <data/cat-routines.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <math/design-matrix.h>
34 #include <data/dictionary.h>
35 #include <libpspp/message.h>
36 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <math/linreg/linreg.h>
40 #include <math/linreg/coefficient.h>
41 #include <data/missing-values.h>
42 #include "regression-export.h"
43 #include <output/table.h>
44 #include <data/value-labels.h>
45 #include <data/variable.h>
46 #include <procedure.h>
48 #define REG_LARGE_DATA 1000
53 "REGRESSION" (regression_):
79 static struct cmd_regression cmd;
81 /* Linear regression models. */
82 pspp_linreg_cache **models = NULL;
85 Transformations for saving predicted values
90 int n_trns; /* Number of transformations. */
91 int trns_id; /* Which trns is this one? */
92 pspp_linreg_cache *c; /* Linear model for this trns. */
95 Variables used (both explanatory and response).
97 static struct variable **v_variables;
102 static size_t n_variables;
105 File where the model will be saved if the EXPORT subcommand
108 struct file_handle *model_file;
111 Return value for the procedure.
113 int pspp_reg_rc = CMD_SUCCESS;
115 static bool run_regression (const struct casefile *, void *);
118 STATISTICS subcommand output functions.
120 static void reg_stats_r (pspp_linreg_cache *);
121 static void reg_stats_coeff (pspp_linreg_cache *);
122 static void reg_stats_anova (pspp_linreg_cache *);
123 static void reg_stats_outs (pspp_linreg_cache *);
124 static void reg_stats_zpp (pspp_linreg_cache *);
125 static void reg_stats_label (pspp_linreg_cache *);
126 static void reg_stats_sha (pspp_linreg_cache *);
127 static void reg_stats_ci (pspp_linreg_cache *);
128 static void reg_stats_f (pspp_linreg_cache *);
129 static void reg_stats_bcov (pspp_linreg_cache *);
130 static void reg_stats_ses (pspp_linreg_cache *);
131 static void reg_stats_xtx (pspp_linreg_cache *);
132 static void reg_stats_collin (pspp_linreg_cache *);
133 static void reg_stats_tol (pspp_linreg_cache *);
134 static void reg_stats_selection (pspp_linreg_cache *);
135 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
136 int, pspp_linreg_cache *);
139 reg_stats_r (pspp_linreg_cache * c)
149 rsq = c->ssm / c->sst;
150 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
151 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
152 t = tab_create (n_cols, n_rows, 0);
153 tab_dim (t, tab_natural_dimensions);
154 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
155 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
156 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
157 tab_vline (t, TAL_0, 1, 0, 0);
159 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
160 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
161 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
162 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
163 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
164 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
165 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
166 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
167 tab_title (t, _("Model Summary"));
172 Table showing estimated regression coefficients.
175 reg_stats_coeff (pspp_linreg_cache * c)
187 const struct variable *v;
188 const union value *val;
193 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
194 n_rows = c->n_coeffs + 2;
196 t = tab_create (n_cols, n_rows, 0);
197 tab_headers (t, 2, 0, 1, 0);
198 tab_dim (t, tab_natural_dimensions);
199 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
200 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
201 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
202 tab_vline (t, TAL_0, 1, 0, 0);
204 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
205 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
206 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
207 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
208 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
209 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
210 coeff = c->coeff[0].estimate;
211 tab_float (t, 2, 1, 0, coeff, 10, 2);
212 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
213 tab_float (t, 3, 1, 0, std_err, 10, 2);
214 beta = coeff / c->depvar_std;
215 tab_float (t, 4, 1, 0, beta, 10, 2);
216 t_stat = coeff / std_err;
217 tab_float (t, 5, 1, 0, t_stat, 10, 2);
218 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
219 tab_float (t, 6, 1, 0, pval, 10, 2);
220 for (j = 1; j <= c->n_indeps; j++)
222 v = pspp_linreg_coeff_get_var (c->coeff + j, 0);
223 label = var_to_string (v);
224 /* Do not overwrite the variable's name. */
225 strncpy (tmp, label, MAX_STRING);
226 if (v->type == ALPHA)
229 Append the value associated with this coefficient.
230 This makes sense only if we us the usual binary encoding
234 val = pspp_linreg_coeff_get_value (c->coeff + j, v);
235 val_s = value_to_string (val, v);
236 strncat (tmp, val_s, MAX_STRING);
239 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
241 Regression coefficients.
243 coeff = c->coeff[j].estimate;
244 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
246 Standard error of the coefficients.
248 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
249 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
251 'Standardized' coefficient, i.e., regression coefficient
252 if all variables had unit variance.
254 beta = gsl_vector_get (c->indep_std, j);
255 beta *= coeff / c->depvar_std;
256 tab_float (t, 4, j + 1, 0, beta, 10, 2);
259 Test statistic for H0: coefficient is 0.
261 t_stat = coeff / std_err;
262 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
264 P values for the test statistic above.
266 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
267 tab_float (t, 6, j + 1, 0, pval, 10, 2);
269 tab_title (t, _("Coefficients"));
275 Display the ANOVA table.
278 reg_stats_anova (pspp_linreg_cache * c)
282 const double msm = c->ssm / c->dfm;
283 const double mse = c->sse / c->dfe;
284 const double F = msm / mse;
285 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
290 t = tab_create (n_cols, n_rows, 0);
291 tab_headers (t, 2, 0, 1, 0);
292 tab_dim (t, tab_natural_dimensions);
294 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
296 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
297 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
298 tab_vline (t, TAL_0, 1, 0, 0);
300 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
301 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
302 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
303 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
304 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
306 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
307 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
308 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
310 /* Sums of Squares */
311 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
312 tab_float (t, 2, 3, 0, c->sst, 10, 2);
313 tab_float (t, 2, 2, 0, c->sse, 10, 2);
316 /* Degrees of freedom */
317 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
318 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
319 tab_float (t, 3, 3, 0, c->dft, 4, 0);
323 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
324 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
326 tab_float (t, 5, 1, 0, F, 8, 3);
328 tab_float (t, 6, 1, 0, pval, 8, 3);
330 tab_title (t, _("ANOVA"));
334 reg_stats_outs (pspp_linreg_cache * c)
339 reg_stats_zpp (pspp_linreg_cache * c)
344 reg_stats_label (pspp_linreg_cache * c)
349 reg_stats_sha (pspp_linreg_cache * c)
354 reg_stats_ci (pspp_linreg_cache * c)
359 reg_stats_f (pspp_linreg_cache * c)
364 reg_stats_bcov (pspp_linreg_cache * c)
376 n_cols = c->n_indeps + 1 + 2;
377 n_rows = 2 * (c->n_indeps + 1);
378 t = tab_create (n_cols, n_rows, 0);
379 tab_headers (t, 2, 0, 1, 0);
380 tab_dim (t, tab_natural_dimensions);
381 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
382 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
383 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
384 tab_vline (t, TAL_0, 1, 0, 0);
385 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
386 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
387 for (i = 1; i < c->n_coeffs; i++)
389 const struct variable *v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
390 label = var_to_string (v);
391 tab_text (t, 2, i, TAB_CENTER, label);
392 tab_text (t, i + 2, 0, TAB_CENTER, label);
393 for (k = 1; k < c->n_coeffs; k++)
395 col = (i <= k) ? k : i;
396 row = (i <= k) ? i : k;
397 tab_float (t, k + 2, i, TAB_CENTER,
398 gsl_matrix_get (c->cov, row, col), 8, 3);
401 tab_title (t, _("Coefficient Correlations"));
405 reg_stats_ses (pspp_linreg_cache * c)
410 reg_stats_xtx (pspp_linreg_cache * c)
415 reg_stats_collin (pspp_linreg_cache * c)
420 reg_stats_tol (pspp_linreg_cache * c)
425 reg_stats_selection (pspp_linreg_cache * c)
431 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
432 int keyword, pspp_linreg_cache * c)
441 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
444 The order here must match the order in which the STATISTICS
445 keywords appear in the specification section above.
472 Set everything but F.
474 for (i = 0; i < f; i++)
481 for (i = 0; i < all; i++)
489 Default output: ANOVA table, parameter estimates,
490 and statistics for variables not entered into model,
493 if (keywords[defaults] | d)
501 statistics_keyword_output (reg_stats_r, keywords[r], c);
502 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
503 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
504 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
505 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
506 statistics_keyword_output (reg_stats_label, keywords[label], c);
507 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
508 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
509 statistics_keyword_output (reg_stats_f, keywords[f], c);
510 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
511 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
512 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
513 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
514 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
515 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
518 Free the transformation. Free its linear model if this
519 transformation is the last one.
522 bool regression_trns_free (void *t_)
525 struct reg_trns *t = t_;
527 if (t->trns_id == t->n_trns)
529 result = pspp_linreg_cache_free (t->c);
536 Gets the predicted values.
539 regression_trns_pred_proc (void *t_, struct ccase *c, int case_idx UNUSED)
544 union value *tmp = NULL;
545 struct reg_trns *t = t_;
546 pspp_linreg_cache *model;
548 const union value **vals = NULL;
549 struct variable **vars = NULL;
550 struct variable **model_vars = NULL;
554 assert (model != NULL);
555 assert (model->depvar != NULL);
556 assert (model->pred != NULL);
558 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_SYSTEM);
559 vals = xnmalloc (n_vars, sizeof (*vals));
560 model_vars = xnmalloc (n_vars, sizeof (*model_vars));
561 assert (vals != NULL);
562 output = case_data_rw (c, model->pred->fv);
563 assert (output != NULL);
565 for (i = 0; i < n_vars; i++)
567 /* Use neither the predicted values nor the dependent variable. */
568 if (vars[i]->index != model->pred->index &&
569 vars[i]->index != model->depvar->index)
571 if (vars[i]->type == ALPHA && vars[i]->obs_vals != NULL)
573 tmp = vars[i]->obs_vals->vals;
580 Make sure the variable we use is in the linear model.
582 if (pspp_linreg_get_coeff (model, vars[i], tmp) != NULL)
584 vals[n_vals] = case_data (c, i);
585 model_vars[n_vals] = vars[i];
590 output->f = (*model->predict) ((const struct variable **) vars,
591 vals, model, n_vals);
594 return TRNS_CONTINUE;
600 regression_trns_resid_proc (void *t_, struct ccase *c, int case_idx UNUSED)
605 struct reg_trns *t = t_;
606 pspp_linreg_cache *model;
609 const union value **vals = NULL;
610 const union value *obs = NULL;
611 struct variable **vars = NULL;
612 struct variable **model_vars = NULL;
616 assert (model != NULL);
617 assert (model->depvar != NULL);
618 assert (model->resid != NULL);
620 dict_get_vars (default_dict, &vars, &n_vars, 1u << DC_SYSTEM);
621 vals = xnmalloc (n_vars, sizeof (*vals));
622 model_vars = xnmalloc (n_vars, sizeof (*model_vars));
623 assert (vals != NULL);
624 output = case_data_rw (c, model->resid->fv);
625 assert (output != NULL);
627 for (i = 0; i < n_vars; i++)
629 /* Use neither the predicted values nor the dependent variable. */
630 if (vars[i]->index != model->resid->index &&
631 vars[i]->index != model->depvar->index)
633 if (vars[i]->type == ALPHA && vars[i]->obs_vals != NULL)
635 tmp = vars[i]->obs_vals->vals;
642 Make sure the variable we use is in the linear model.
644 if (pspp_linreg_get_coeff (model, vars[i], tmp) != NULL)
646 vals[n_vals] = case_data (c, i);
647 model_vars[n_vals] = vars[i];
651 if (vars[i]->index == model->depvar->index)
653 obs = case_data (c, i);
654 assert (obs != NULL);
657 output->f = (*model->residual) ((const struct variable **) vars,
658 vals, obs, model, n_vals);
661 return TRNS_CONTINUE;
664 Returns 0 if NAME is a duplicate of any existing variable name.
667 try_name (char *name)
669 if (dict_lookup_var (default_dict, name) != NULL)
675 void reg_get_name (char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
679 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
680 while (!try_name(name))
683 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
687 reg_save_var (const char *prefix, trns_proc_func *f,
688 pspp_linreg_cache *c, struct variable **v,
691 static int trns_index = 1;
692 char name[LONG_NAME_LEN + 1];
693 struct variable *new_var;
694 struct reg_trns *t = NULL;
696 t = xmalloc (sizeof (*t));
698 t->trns_id = trns_index;
701 reg_get_name (name, prefix);
702 new_var = dict_create_var (default_dict, name, 0);
703 assert (new_var != NULL);
705 add_transformation (f, regression_trns_free, t);
709 subcommand_save (int save, pspp_linreg_cache **models)
711 pspp_linreg_cache **lc;
715 assert (models != NULL);
719 /* Count the number of transformations we will need. */
720 for (i = 0; i < REGRESSION_SV_count; i++)
727 n_trns *= cmd.n_dependent;
729 for (lc = models; lc < models + cmd.n_dependent; lc++)
731 assert (*lc != NULL);
732 assert ((*lc)->depvar != NULL);
733 if (cmd.a_save[REGRESSION_SV_RESID])
735 reg_save_var ("RES", regression_trns_resid_proc, *lc, &(*lc)->resid, n_trns);
737 if (cmd.a_save[REGRESSION_SV_PRED])
739 reg_save_var ("PRED", regression_trns_pred_proc, *lc, &(*lc)->pred, n_trns);
745 for (lc = models; lc < models + cmd.n_dependent; lc++)
747 assert (*lc != NULL);
748 pspp_linreg_cache_free (*lc);
753 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
757 for (i = 0; i < n_vars; i++)
759 if (v->index == varlist[i]->index)
767 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
772 struct variable **varlist;
773 struct pspp_linreg_coeff *coeff;
774 const struct variable *v;
777 fprintf (fp, "%s", reg_export_categorical_encode_1);
779 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
780 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
782 coeff = c->coeff + i;
783 v = pspp_linreg_coeff_get_var (coeff, 0);
784 if (v->type == ALPHA)
786 if (!reg_inserted (v, varlist, n_vars))
788 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
790 varlist[n_vars] = (struct variable *) v;
795 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
796 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
798 for (i = 0; i < n_vars - 1; i++)
800 fprintf (fp, "&%s,\n\t\t", varlist[i]->name);
802 fprintf (fp, "&%s};\n\t", varlist[i]->name);
804 for (i = 0; i < n_vars; i++)
806 coeff = c->coeff + i;
807 fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name,
809 fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name,
810 varlist[i]->obs_vals->n_categories);
812 for (j = 0; j < varlist[i]->obs_vals->n_categories; j++)
814 val = cat_subscript_to_value ((const size_t) j, varlist[i]);
815 fprintf (fp, "%s.values[%d] = \"%s\";\n\t", varlist[i]->name, j,
816 value_to_string (val, varlist[i]));
819 fprintf (fp, "%s", reg_export_categorical_encode_2);
823 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
826 struct pspp_linreg_coeff *coeff;
827 const struct variable *v;
829 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
830 for (i = 1; i < c->n_indeps; i++)
832 coeff = c->coeff + i;
833 v = pspp_linreg_coeff_get_var (coeff, 0);
834 fprintf (fp, "\"%s\",\n\t\t", v->name);
836 coeff = c->coeff + i;
837 v = pspp_linreg_coeff_get_var (coeff, 0);
838 fprintf (fp, "\"%s\"};\n\t", v->name);
841 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
843 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
844 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
845 reg_print_depvars (fp, c);
846 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
848 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
849 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
852 reg_has_categorical (pspp_linreg_cache * c)
855 const struct variable *v;
857 for (i = 1; i < c->n_coeffs; i++)
859 v = pspp_linreg_coeff_get_var (c->coeff + i, 0);
860 if (v->type == ALPHA)
869 subcommand_export (int export, pspp_linreg_cache * c)
874 int n_quantiles = 100;
877 struct pspp_linreg_coeff coeff;
882 assert (model_file != NULL);
883 fp = fopen (fh_get_file_name (model_file), "w");
885 fprintf (fp, "%s", reg_preamble);
886 reg_print_getvar (fp, c);
887 if (reg_has_categorical (c))
889 reg_print_categorical_encoding (fp, c);
891 fprintf (fp, "%s", reg_export_t_quantiles_1);
892 increment = 0.5 / (double) increment;
893 for (i = 0; i < n_quantiles - 1; i++)
895 tmp = 0.5 + 0.005 * (double) i;
896 fprintf (fp, "%.15e,\n\t\t",
897 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
899 fprintf (fp, "%.15e};\n\t",
900 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
901 fprintf (fp, "%s", reg_export_t_quantiles_2);
902 fprintf (fp, "%s", reg_mean_cmt);
903 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
904 fprintf (fp, "const char *var_names[])\n{\n\t");
905 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
906 for (i = 1; i < c->n_indeps; i++)
909 fprintf (fp, "%.15e,\n\t\t", coeff.estimate);
912 fprintf (fp, "%.15e};\n\t", coeff.estimate);
914 fprintf (fp, "double estimate = %.15e;\n\t", coeff.estimate);
915 fprintf (fp, "int i;\n\tint j;\n\n\t");
916 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
917 fprintf (fp, "%s", reg_getvar);
918 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
920 for (i = 0; i < c->cov->size1 - 1; i++)
923 for (j = 0; j < c->cov->size2 - 1; j++)
925 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
927 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
930 for (j = 0; j < c->cov->size2 - 1; j++)
932 fprintf (fp, "%.15e, ",
933 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
935 fprintf (fp, "%.15e}\n\t",
936 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
937 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
939 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
940 fprintf (fp, "%s", reg_variance);
941 fprintf (fp, "%s", reg_export_confidence_interval);
942 tmp = c->mse * c->mse;
943 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
944 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
945 fprintf (fp, "%s", reg_export_prediction_interval_3);
947 fp = fopen ("pspp_model_reg.h", "w");
948 fprintf (fp, "%s", reg_header);
953 regression_custom_export (struct cmd_regression *cmd UNUSED)
955 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
956 if (!lex_force_match ('('))
963 model_file = fh_parse (FH_REF_FILE);
964 if (model_file == NULL)
968 if (!lex_force_match (')'))
975 cmd_regression (void)
977 if (!parse_regression (&cmd))
980 models = xnmalloc (cmd.n_dependent, sizeof *models);
981 if (!multipass_procedure_with_splits (run_regression, &cmd))
982 return CMD_CASCADING_FAILURE;
983 subcommand_save (cmd.sbc_save, models);
990 Is variable k the dependent variable?
993 is_depvar (size_t k, const struct variable *v)
996 compare_var_names returns 0 if the variable
999 if (!compare_var_names (v, v_variables[k], NULL))
1006 Mark missing cases. Return the number of non-missing cases.
1009 mark_missing_cases (const struct casefile *cf, struct variable *v,
1010 int *is_missing_case, double n_data)
1012 struct casereader *r;
1015 const union value *val;
1017 for (r = casefile_get_reader (cf);
1018 casereader_read (r, &c); case_destroy (&c))
1020 row = casereader_cnum (r) - 1;
1022 val = case_data (&c, v->fv);
1023 cat_value_update (v, val);
1024 if (mv_is_value_missing (&v->miss, val))
1026 if (!is_missing_case[row])
1028 /* Now it is missing. */
1030 is_missing_case[row] = 1;
1034 casereader_destroy (r);
1039 /* Parser for the variables sub command */
1041 regression_custom_variables(struct cmd_regression *cmd UNUSED)
1046 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
1051 if (!parse_variables (default_dict, &v_variables, &n_variables,
1057 assert(n_variables);
1062 Count the explanatory variables. The user may or may
1063 not have specified a response variable in the syntax.
1066 int get_n_indep (const struct variable *v)
1071 result = n_variables;
1072 while (i < n_variables)
1074 if (is_depvar (i, v))
1084 Read from the active file. Identify the explanatory variables in
1085 v_variables. Encode categorical variables. Drop cases with missing
1089 int prepare_data (int n_data, int is_missing_case[],
1090 struct variable **indep_vars,
1091 struct variable *depvar,
1092 const struct casefile *cf)
1097 assert (indep_vars != NULL);
1099 for (i = 0; i < n_variables; i++)
1101 if (!is_depvar (i, depvar))
1103 indep_vars[j] = v_variables[i];
1105 if (v_variables[i]->type == ALPHA)
1107 /* Make a place to hold the binary vectors
1108 corresponding to this variable's values. */
1109 cat_stored_values_create (v_variables[i]);
1111 n_data = mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1115 Mark missing cases for the dependent variable.
1117 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1122 run_regression (const struct casefile *cf, void *cmd_ UNUSED)
1125 size_t n_data = 0; /* Number of valide cases. */
1126 size_t n_cases; /* Number of cases. */
1132 Keep track of the missing cases.
1134 int *is_missing_case;
1135 const union value *val;
1136 struct casereader *r;
1138 struct variable **indep_vars;
1139 struct design_matrix *X;
1142 pspp_linreg_opts lopts;
1144 assert (models != NULL);
1147 dict_get_vars (default_dict, &v_variables, &n_variables,
1151 n_cases = casefile_get_case_cnt (cf);
1153 for (i = 0; i < cmd.n_dependent; i++)
1155 if (cmd.v_dependent[i]->type != NUMERIC)
1157 msg (SE, gettext ("Dependent variable must be numeric."));
1158 pspp_reg_rc = CMD_FAILURE;
1163 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1165 lopts.get_depvar_mean_std = 1;
1167 for (k = 0; k < cmd.n_dependent; k++)
1169 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1170 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1171 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1172 assert (indep_vars != NULL);
1174 for (i = 0; i < n_cases; i++)
1176 is_missing_case[i] = 0;
1178 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1180 (const struct casefile *) cf);
1181 Y = gsl_vector_alloc (n_data);
1184 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1186 for (i = 0; i < X->m->size2; i++)
1188 lopts.get_indep_mean_std[i] = 1;
1190 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1191 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1192 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1193 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1195 For large data sets, use QR decomposition.
1197 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1199 models[k]->method = PSPP_LINREG_SVD;
1203 The second pass fills the design matrix.
1206 for (r = casefile_get_reader (cf); casereader_read (r, &c);
1208 /* Iterate over the cases. */
1210 case_num = casereader_cnum (r) - 1;
1211 if (!is_missing_case[case_num])
1213 for (i = 0; i < n_variables; ++i) /* Iterate over the
1218 val = case_data (&c, v_variables[i]->fv);
1220 Independent/dependent variable separation. The
1221 'variables' subcommand specifies a varlist which contains
1222 both dependent and independent variables. The dependent
1223 variables are specified with the 'dependent'
1224 subcommand, and maybe also in the 'variables' subcommand.
1225 We need to separate the two.
1227 if (!is_depvar (i, cmd.v_dependent[k]))
1229 if (v_variables[i]->type == ALPHA)
1231 design_matrix_set_categorical (X, row, v_variables[i], val);
1233 else if (v_variables[i]->type == NUMERIC)
1235 design_matrix_set_numeric (X, row, v_variables[i], val);
1239 val = case_data (&c, cmd.v_dependent[k]->fv);
1240 gsl_vector_set (Y, row, val->f);
1245 Now that we know the number of coefficients, allocate space
1246 and store pointers to the variables that correspond to the
1249 pspp_linreg_coeff_init (models[k], X);
1252 Find the least-squares estimates and other statistics.
1254 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1255 subcommand_statistics (cmd.a_statistics, models[k]);
1256 subcommand_export (cmd.sbc_export, models[k]);
1258 gsl_vector_free (Y);
1259 design_matrix_destroy (X);
1261 free (lopts.get_indep_mean_std);
1262 casereader_destroy (r);
1265 free (is_missing_case);