1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include <gsl/gsl_cdf.h>
22 #include <gsl/gsl_matrix.h>
23 #include <gsl/gsl_vector.h>
27 #include "regression-export.h"
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/cat-routines.h>
31 #include <data/category.h>
32 #include <data/dictionary.h>
33 #include <data/missing-values.h>
34 #include <data/procedure.h>
35 #include <data/transformations.h>
36 #include <data/value-labels.h>
37 #include <data/variable.h>
38 #include <language/command.h>
39 #include <language/dictionary/split-file.h>
40 #include <language/data-io/file-handle.h>
41 #include <language/lexer/lexer.h>
42 #include <libpspp/alloc.h>
43 #include <libpspp/compiler.h>
44 #include <libpspp/message.h>
45 #include <math/design-matrix.h>
46 #include <math/coefficient.h>
47 #include <math/linreg/linreg.h>
48 #include <output/table.h>
52 #define REG_LARGE_DATA 1000
57 "REGRESSION" (regression_):
78 +save[sv_]=resid,pred;
83 static struct cmd_regression cmd;
85 /* Linear regression models. */
86 static pspp_linreg_cache **models = NULL;
89 Transformations for saving predicted values
94 int n_trns; /* Number of transformations. */
95 int trns_id; /* Which trns is this one? */
96 pspp_linreg_cache *c; /* Linear model for this trns. */
99 Variables used (both explanatory and response).
101 static struct variable **v_variables;
106 static size_t n_variables;
109 File where the model will be saved if the EXPORT subcommand
112 static struct file_handle *model_file;
115 Return value for the procedure.
117 static int pspp_reg_rc = CMD_SUCCESS;
119 static bool run_regression (const struct ccase *,
120 const struct casefile *, void *,
121 const struct dataset *);
124 STATISTICS subcommand output functions.
126 static void reg_stats_r (pspp_linreg_cache *);
127 static void reg_stats_coeff (pspp_linreg_cache *);
128 static void reg_stats_anova (pspp_linreg_cache *);
129 static void reg_stats_outs (pspp_linreg_cache *);
130 static void reg_stats_zpp (pspp_linreg_cache *);
131 static void reg_stats_label (pspp_linreg_cache *);
132 static void reg_stats_sha (pspp_linreg_cache *);
133 static void reg_stats_ci (pspp_linreg_cache *);
134 static void reg_stats_f (pspp_linreg_cache *);
135 static void reg_stats_bcov (pspp_linreg_cache *);
136 static void reg_stats_ses (pspp_linreg_cache *);
137 static void reg_stats_xtx (pspp_linreg_cache *);
138 static void reg_stats_collin (pspp_linreg_cache *);
139 static void reg_stats_tol (pspp_linreg_cache *);
140 static void reg_stats_selection (pspp_linreg_cache *);
141 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
142 int, pspp_linreg_cache *);
145 reg_stats_r (pspp_linreg_cache * c)
155 rsq = c->ssm / c->sst;
156 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
157 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
158 t = tab_create (n_cols, n_rows, 0);
159 tab_dim (t, tab_natural_dimensions);
160 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
161 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
162 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
163 tab_vline (t, TAL_0, 1, 0, 0);
165 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
166 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
167 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
168 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
169 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
170 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
171 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
172 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
173 tab_title (t, _("Model Summary"));
178 Table showing estimated regression coefficients.
181 reg_stats_coeff (pspp_linreg_cache * c)
193 const struct variable *v;
194 const union value *val;
199 tmp = xnmalloc (MAX_STRING, sizeof (*tmp));
200 n_rows = c->n_coeffs + 2;
202 t = tab_create (n_cols, n_rows, 0);
203 tab_headers (t, 2, 0, 1, 0);
204 tab_dim (t, tab_natural_dimensions);
205 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
206 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
207 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
208 tab_vline (t, TAL_0, 1, 0, 0);
210 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
211 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
212 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
213 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
214 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
215 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
216 coeff = c->coeff[0]->estimate;
217 tab_float (t, 2, 1, 0, coeff, 10, 2);
218 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
219 tab_float (t, 3, 1, 0, std_err, 10, 2);
220 beta = coeff / c->depvar_std;
221 tab_float (t, 4, 1, 0, beta, 10, 2);
222 t_stat = coeff / std_err;
223 tab_float (t, 5, 1, 0, t_stat, 10, 2);
224 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
225 tab_float (t, 6, 1, 0, pval, 10, 2);
226 for (j = 1; j <= c->n_indeps; j++)
228 v = pspp_coeff_get_var (c->coeff[j], 0);
229 label = var_to_string (v);
230 /* Do not overwrite the variable's name. */
231 strncpy (tmp, label, MAX_STRING);
232 if (var_is_alpha (v))
235 Append the value associated with this coefficient.
236 This makes sense only if we us the usual binary encoding
240 val = pspp_coeff_get_value (c->coeff[j], v);
241 val_s = var_get_value_name (v, val);
242 strncat (tmp, val_s, MAX_STRING);
245 tab_text (t, 1, j + 1, TAB_CENTER, tmp);
247 Regression coefficients.
249 coeff = c->coeff[j]->estimate;
250 tab_float (t, 2, j + 1, 0, coeff, 10, 2);
252 Standard error of the coefficients.
254 std_err = sqrt (gsl_matrix_get (c->cov, j, j));
255 tab_float (t, 3, j + 1, 0, std_err, 10, 2);
257 'Standardized' coefficient, i.e., regression coefficient
258 if all variables had unit variance.
260 beta = gsl_vector_get (c->indep_std, j);
261 beta *= coeff / c->depvar_std;
262 tab_float (t, 4, j + 1, 0, beta, 10, 2);
265 Test statistic for H0: coefficient is 0.
267 t_stat = coeff / std_err;
268 tab_float (t, 5, j + 1, 0, t_stat, 10, 2);
270 P values for the test statistic above.
272 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (c->n_obs - c->n_coeffs));
273 tab_float (t, 6, j + 1, 0, pval, 10, 2);
275 tab_title (t, _("Coefficients"));
281 Display the ANOVA table.
284 reg_stats_anova (pspp_linreg_cache * c)
288 const double msm = c->ssm / c->dfm;
289 const double mse = c->sse / c->dfe;
290 const double F = msm / mse;
291 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
296 t = tab_create (n_cols, n_rows, 0);
297 tab_headers (t, 2, 0, 1, 0);
298 tab_dim (t, tab_natural_dimensions);
300 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
302 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
303 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
304 tab_vline (t, TAL_0, 1, 0, 0);
306 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
307 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
308 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
309 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
310 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
312 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
313 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
314 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
316 /* Sums of Squares */
317 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
318 tab_float (t, 2, 3, 0, c->sst, 10, 2);
319 tab_float (t, 2, 2, 0, c->sse, 10, 2);
322 /* Degrees of freedom */
323 tab_float (t, 3, 1, 0, c->dfm, 4, 0);
324 tab_float (t, 3, 2, 0, c->dfe, 4, 0);
325 tab_float (t, 3, 3, 0, c->dft, 4, 0);
329 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
330 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
332 tab_float (t, 5, 1, 0, F, 8, 3);
334 tab_float (t, 6, 1, 0, pval, 8, 3);
336 tab_title (t, _("ANOVA"));
340 reg_stats_outs (pspp_linreg_cache * c)
345 reg_stats_zpp (pspp_linreg_cache * c)
350 reg_stats_label (pspp_linreg_cache * c)
355 reg_stats_sha (pspp_linreg_cache * c)
360 reg_stats_ci (pspp_linreg_cache * c)
365 reg_stats_f (pspp_linreg_cache * c)
370 reg_stats_bcov (pspp_linreg_cache * c)
382 n_cols = c->n_indeps + 1 + 2;
383 n_rows = 2 * (c->n_indeps + 1);
384 t = tab_create (n_cols, n_rows, 0);
385 tab_headers (t, 2, 0, 1, 0);
386 tab_dim (t, tab_natural_dimensions);
387 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
388 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
389 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
390 tab_vline (t, TAL_0, 1, 0, 0);
391 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
392 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
393 for (i = 1; i < c->n_coeffs; i++)
395 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
396 label = var_to_string (v);
397 tab_text (t, 2, i, TAB_CENTER, label);
398 tab_text (t, i + 2, 0, TAB_CENTER, label);
399 for (k = 1; k < c->n_coeffs; k++)
401 col = (i <= k) ? k : i;
402 row = (i <= k) ? i : k;
403 tab_float (t, k + 2, i, TAB_CENTER,
404 gsl_matrix_get (c->cov, row, col), 8, 3);
407 tab_title (t, _("Coefficient Correlations"));
411 reg_stats_ses (pspp_linreg_cache * c)
416 reg_stats_xtx (pspp_linreg_cache * c)
421 reg_stats_collin (pspp_linreg_cache * c)
426 reg_stats_tol (pspp_linreg_cache * c)
431 reg_stats_selection (pspp_linreg_cache * c)
437 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
438 int keyword, pspp_linreg_cache * c)
447 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
450 The order here must match the order in which the STATISTICS
451 keywords appear in the specification section above.
478 Set everything but F.
480 for (i = 0; i < f; i++)
487 for (i = 0; i < all; i++)
495 Default output: ANOVA table, parameter estimates,
496 and statistics for variables not entered into model,
499 if (keywords[defaults] | d)
507 statistics_keyword_output (reg_stats_r, keywords[r], c);
508 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
509 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
510 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
511 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
512 statistics_keyword_output (reg_stats_label, keywords[label], c);
513 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
514 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
515 statistics_keyword_output (reg_stats_f, keywords[f], c);
516 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
517 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
518 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
519 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
520 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
521 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
525 Free the transformation. Free its linear model if this
526 transformation is the last one.
529 regression_trns_free (void *t_)
532 struct reg_trns *t = t_;
534 if (t->trns_id == t->n_trns)
536 result = pspp_linreg_cache_free (t->c);
544 Gets the predicted values.
547 regression_trns_pred_proc (void *t_, struct ccase *c,
548 casenumber case_idx UNUSED)
552 struct reg_trns *trns = t_;
553 pspp_linreg_cache *model;
554 union value *output = NULL;
555 const union value **vals = NULL;
556 struct variable **vars = NULL;
558 assert (trns != NULL);
560 assert (model != NULL);
561 assert (model->depvar != NULL);
562 assert (model->pred != NULL);
564 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
565 n_vals = (*model->get_vars) (model, vars);
567 vals = xnmalloc (n_vals, sizeof (*vals));
568 output = case_data_rw (c, model->pred);
569 assert (output != NULL);
571 for (i = 0; i < n_vals; i++)
573 vals[i] = case_data (c, vars[i]);
575 output->f = (*model->predict) ((const struct variable **) vars,
576 vals, model, n_vals);
579 return TRNS_CONTINUE;
586 regression_trns_resid_proc (void *t_, struct ccase *c,
587 casenumber case_idx UNUSED)
591 struct reg_trns *trns = t_;
592 pspp_linreg_cache *model;
593 union value *output = NULL;
594 const union value **vals = NULL;
595 const union value *obs = NULL;
596 struct variable **vars = NULL;
598 assert (trns != NULL);
600 assert (model != NULL);
601 assert (model->depvar != NULL);
602 assert (model->resid != NULL);
604 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
605 n_vals = (*model->get_vars) (model, vars);
607 vals = xnmalloc (n_vals, sizeof (*vals));
608 output = case_data_rw (c, model->resid);
609 assert (output != NULL);
611 for (i = 0; i < n_vals; i++)
613 vals[i] = case_data (c, vars[i]);
615 obs = case_data (c, model->depvar);
616 output->f = (*model->residual) ((const struct variable **) vars,
617 vals, obs, model, n_vals);
620 return TRNS_CONTINUE;
624 Returns false if NAME is a duplicate of any existing variable name.
627 try_name (const struct dictionary *dict, const char *name)
629 if (dict_lookup_var (dict, name) != NULL)
636 reg_get_name (const struct dictionary *dict, char name[LONG_NAME_LEN], const char prefix[LONG_NAME_LEN])
640 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
641 while (!try_name (dict, name))
644 snprintf (name, LONG_NAME_LEN, "%s%d", prefix, i);
649 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
650 pspp_linreg_cache * c, struct variable **v, int n_trns)
652 struct dictionary *dict = dataset_dict (ds);
653 static int trns_index = 1;
654 char name[LONG_NAME_LEN];
655 struct variable *new_var;
656 struct reg_trns *t = NULL;
658 t = xmalloc (sizeof (*t));
659 t->trns_id = trns_index;
662 reg_get_name (dict, name, prefix);
663 new_var = dict_create_var (dict, name, 0);
664 assert (new_var != NULL);
666 add_transformation (ds, f, regression_trns_free, t);
671 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
673 pspp_linreg_cache **lc;
677 assert (models != NULL);
681 /* Count the number of transformations we will need. */
682 for (i = 0; i < REGRESSION_SV_count; i++)
689 n_trns *= cmd.n_dependent;
691 for (lc = models; lc < models + cmd.n_dependent; lc++)
693 assert (*lc != NULL);
694 assert ((*lc)->depvar != NULL);
695 if (cmd.a_save[REGRESSION_SV_RESID])
697 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
698 &(*lc)->resid, n_trns);
700 if (cmd.a_save[REGRESSION_SV_PRED])
702 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
703 &(*lc)->pred, n_trns);
709 for (lc = models; lc < models + cmd.n_dependent; lc++)
711 assert (*lc != NULL);
712 pspp_linreg_cache_free (*lc);
718 reg_inserted (const struct variable *v, struct variable **varlist, int n_vars)
722 for (i = 0; i < n_vars; i++)
733 reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c)
737 struct variable **varlist;
739 fprintf (fp, "%s", reg_export_categorical_encode_1);
741 varlist = xnmalloc (c->n_indeps, sizeof (*varlist));
742 for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */
744 struct pspp_coeff *coeff = c->coeff[i];
745 const struct variable *v = pspp_coeff_get_var (coeff, 0);
746 if (var_is_alpha (v))
748 if (!reg_inserted (v, varlist, n_vars))
750 fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t",
752 varlist[n_vars] = (struct variable *) v;
757 fprintf (fp, "int n_vars = %d;\n\t", n_vars);
758 fprintf (fp, "struct pspp_reg_categorical_variable *varlist[%d] = {",
760 for (i = 0; i < n_vars - 1; i++)
762 fprintf (fp, "&%s,\n\t\t", var_get_name (varlist[i]));
764 fprintf (fp, "&%s};\n\t", var_get_name (varlist[i]));
766 for (i = 0; i < n_vars; i++)
768 int n_categories = cat_get_n_categories (varlist[i]);
771 fprintf (fp, "%s.name = \"%s\";\n\t",
772 var_get_name (varlist[i]),
773 var_get_name (varlist[i]));
774 fprintf (fp, "%s.n_vals = %d;\n\t",
775 var_get_name (varlist[i]),
778 for (j = 0; j < n_categories; j++)
780 union value *val = cat_subscript_to_value (j, varlist[i]);
781 fprintf (fp, "%s.values[%d] = \"%s\";\n\t",
782 var_get_name (varlist[i]), j,
783 var_get_value_name (varlist[i], val));
786 fprintf (fp, "%s", reg_export_categorical_encode_2);
790 reg_print_depvars (FILE * fp, pspp_linreg_cache * c)
793 struct pspp_coeff *coeff;
794 const struct variable *v;
796 fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps);
797 for (i = 1; i < c->n_indeps; i++)
800 v = pspp_coeff_get_var (coeff, 0);
801 fprintf (fp, "\"%s\",\n\t\t", var_get_name (v));
804 v = pspp_coeff_get_var (coeff, 0);
805 fprintf (fp, "\"%s\"};\n\t", var_get_name (v));
808 reg_print_getvar (FILE * fp, pspp_linreg_cache * c)
810 fprintf (fp, "static int\npspp_reg_getvar (char *v_name)\n{\n\t");
811 fprintf (fp, "int i;\n\tint n_vars = %d;\n\t", c->n_indeps);
812 reg_print_depvars (fp, c);
813 fprintf (fp, "for (i = 0; i < n_vars; i++)\n\t{\n\t\t");
815 "if (strncmp (v_name, model_depvars[i], PSPP_REG_MAXLEN) == 0)\n\t\t{\n\t\t\t");
816 fprintf (fp, "return i;\n\t\t}\n\t}\n}\n");
819 reg_has_categorical (pspp_linreg_cache * c)
822 const struct variable *v;
824 for (i = 1; i < c->n_coeffs; i++)
826 v = pspp_coeff_get_var (c->coeff[i], 0);
827 if (var_is_alpha (v))
834 subcommand_export (int export, pspp_linreg_cache * c)
839 int n_quantiles = 100;
841 struct pspp_coeff *coeff;
846 assert (model_file != NULL);
847 fp = fopen (fh_get_file_name (model_file), "w");
849 fprintf (fp, "%s", reg_preamble);
850 reg_print_getvar (fp, c);
851 if (reg_has_categorical (c))
853 reg_print_categorical_encoding (fp, c);
855 fprintf (fp, "%s", reg_export_t_quantiles_1);
856 for (i = 0; i < n_quantiles - 1; i++)
858 tmp = 0.5 + 0.005 * (double) i;
859 fprintf (fp, "%.15e,\n\t\t",
860 gsl_cdf_tdist_Pinv (tmp, c->n_obs - c->n_indeps));
862 fprintf (fp, "%.15e};\n\t",
863 gsl_cdf_tdist_Pinv (.9995, c->n_obs - c->n_indeps));
864 fprintf (fp, "%s", reg_export_t_quantiles_2);
865 fprintf (fp, "%s", reg_mean_cmt);
866 fprintf (fp, "double\npspp_reg_estimate (const double *var_vals,");
867 fprintf (fp, "const char *var_names[])\n{\n\t");
868 fprintf (fp, "double model_coeffs[%d] = {", c->n_indeps);
869 for (i = 1; i < c->n_indeps; i++)
872 fprintf (fp, "%.15e,\n\t\t", coeff->estimate);
875 fprintf (fp, "%.15e};\n\t", coeff->estimate);
877 fprintf (fp, "double estimate = %.15e;\n\t", coeff->estimate);
878 fprintf (fp, "int i;\n\tint j;\n\n\t");
879 fprintf (fp, "for (i = 0; i < %d; i++)\n\t", c->n_indeps);
880 fprintf (fp, "%s", reg_getvar);
881 fprintf (fp, "const double cov[%d][%d] = {\n\t", c->n_coeffs,
883 for (i = 0; i < c->cov->size1 - 1; i++)
886 for (j = 0; j < c->cov->size2 - 1; j++)
888 fprintf (fp, "%.15e, ", gsl_matrix_get (c->cov, i, j));
890 fprintf (fp, "%.15e},\n\t", gsl_matrix_get (c->cov, i, j));
893 for (j = 0; j < c->cov->size2 - 1; j++)
895 fprintf (fp, "%.15e, ",
896 gsl_matrix_get (c->cov, c->cov->size1 - 1, j));
898 fprintf (fp, "%.15e}\n\t",
899 gsl_matrix_get (c->cov, c->cov->size1 - 1, c->cov->size2 - 1));
900 fprintf (fp, "};\n\tint n_vars = %d;\n\tint i;\n\tint j;\n\t",
902 fprintf (fp, "double unshuffled_vals[%d];\n\t", c->n_indeps);
903 fprintf (fp, "%s", reg_variance);
904 fprintf (fp, "%s", reg_export_confidence_interval);
905 tmp = c->mse * c->mse;
906 fprintf (fp, "%s %.15e", reg_export_prediction_interval_1, tmp);
907 fprintf (fp, "%s %.15e", reg_export_prediction_interval_2, tmp);
908 fprintf (fp, "%s", reg_export_prediction_interval_3);
910 fp = fopen ("pspp_model_reg.h", "w");
911 fprintf (fp, "%s", reg_header);
917 regression_custom_export (struct lexer *lexer, struct dataset *ds UNUSED, struct cmd_regression *cmd UNUSED, void *aux UNUSED)
919 /* 0 on failure, 1 on success, 2 on failure that should result in syntax error */
920 if (!lex_force_match (lexer, '('))
923 if (lex_match (lexer, '*'))
927 model_file = fh_parse (lexer, FH_REF_FILE);
928 if (model_file == NULL)
932 if (!lex_force_match (lexer, ')'))
939 cmd_regression (struct lexer *lexer, struct dataset *ds)
941 if (!parse_regression (lexer, ds, &cmd, NULL))
944 models = xnmalloc (cmd.n_dependent, sizeof *models);
945 if (!multipass_procedure_with_splits (ds, run_regression, &cmd))
946 return CMD_CASCADING_FAILURE;
947 subcommand_save (ds, cmd.sbc_save, models);
954 Is variable k the dependent variable?
957 is_depvar (size_t k, const struct variable *v)
959 return v == v_variables[k];
963 Mark missing cases. Return the number of non-missing cases.
966 mark_missing_cases (const struct casefile *cf, struct variable *v,
967 int *is_missing_case, double n_data)
969 struct casereader *r;
972 const union value *val;
974 for (r = casefile_get_reader (cf, NULL);
975 casereader_read (r, &c); case_destroy (&c))
977 row = casereader_cnum (r) - 1;
979 val = case_data (&c, v);
980 cat_value_update (v, val);
981 if (var_is_value_missing (v, val, MV_ANY))
983 if (!is_missing_case[row])
985 /* Now it is missing. */
987 is_missing_case[row] = 1;
991 casereader_destroy (r);
996 /* Parser for the variables sub command */
998 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
999 struct cmd_regression *cmd UNUSED,
1002 const struct dictionary *dict = dataset_dict (ds);
1004 lex_match (lexer, '=');
1006 if ((lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
1007 && lex_token (lexer) != T_ALL)
1011 if (!parse_variables (lexer, dict, &v_variables, &n_variables, PV_NONE))
1016 assert (n_variables);
1022 Count the explanatory variables. The user may or may
1023 not have specified a response variable in the syntax.
1026 get_n_indep (const struct variable *v)
1031 result = n_variables;
1032 while (i < n_variables)
1034 if (is_depvar (i, v))
1045 Read from the active file. Identify the explanatory variables in
1046 v_variables. Encode categorical variables. Drop cases with missing
1050 prepare_data (int n_data, int is_missing_case[],
1051 struct variable **indep_vars,
1052 struct variable *depvar, const struct casefile *cf)
1057 assert (indep_vars != NULL);
1059 for (i = 0; i < n_variables; i++)
1061 if (!is_depvar (i, depvar))
1063 indep_vars[j] = v_variables[i];
1065 if (var_is_alpha (v_variables[i]))
1067 /* Make a place to hold the binary vectors
1068 corresponding to this variable's values. */
1069 cat_stored_values_create (v_variables[i]);
1072 mark_missing_cases (cf, v_variables[i], is_missing_case, n_data);
1076 Mark missing cases for the dependent variable.
1078 n_data = mark_missing_cases (cf, depvar, is_missing_case, n_data);
1083 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
1085 c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff));
1086 c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */
1087 c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */
1088 pspp_coeff_init (c->coeff + 1, dm);
1092 run_regression (const struct ccase *first,
1093 const struct casefile *cf, void *cmd_ UNUSED, const struct dataset *ds)
1096 size_t n_data = 0; /* Number of valide cases. */
1097 size_t n_cases; /* Number of cases. */
1103 Keep track of the missing cases.
1105 int *is_missing_case;
1106 const union value *val;
1107 struct casereader *r;
1109 struct variable **indep_vars;
1110 struct design_matrix *X;
1113 pspp_linreg_opts lopts;
1115 assert (models != NULL);
1117 output_split_file_values (ds, first);
1121 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables,
1125 n_cases = casefile_get_case_cnt (cf);
1127 for (i = 0; i < cmd.n_dependent; i++)
1129 if (!var_is_numeric (cmd.v_dependent[i]))
1131 msg (SE, gettext ("Dependent variable must be numeric."));
1132 pspp_reg_rc = CMD_FAILURE;
1137 is_missing_case = xnmalloc (n_cases, sizeof (*is_missing_case));
1139 lopts.get_depvar_mean_std = 1;
1141 for (k = 0; k < cmd.n_dependent; k++)
1143 n_indep = get_n_indep ((const struct variable *) cmd.v_dependent[k]);
1144 lopts.get_indep_mean_std = xnmalloc (n_indep, sizeof (int));
1145 indep_vars = xnmalloc (n_indep, sizeof *indep_vars);
1146 assert (indep_vars != NULL);
1148 for (i = 0; i < n_cases; i++)
1150 is_missing_case[i] = 0;
1152 n_data = prepare_data (n_cases, is_missing_case, indep_vars,
1154 (const struct casefile *) cf);
1155 Y = gsl_vector_alloc (n_data);
1158 design_matrix_create (n_indep, (const struct variable **) indep_vars,
1160 for (i = 0; i < X->m->size2; i++)
1162 lopts.get_indep_mean_std[i] = 1;
1164 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
1165 models[k]->indep_means = gsl_vector_alloc (X->m->size2);
1166 models[k]->indep_std = gsl_vector_alloc (X->m->size2);
1167 models[k]->depvar = (const struct variable *) cmd.v_dependent[k];
1169 For large data sets, use QR decomposition.
1171 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
1173 models[k]->method = PSPP_LINREG_SVD;
1177 The second pass fills the design matrix.
1180 for (r = casefile_get_reader (cf, NULL); casereader_read (r, &c);
1182 /* Iterate over the cases. */
1184 case_num = casereader_cnum (r) - 1;
1185 if (!is_missing_case[case_num])
1187 for (i = 0; i < n_variables; ++i) /* Iterate over the
1192 val = case_data (&c, v_variables[i]);
1194 Independent/dependent variable separation. The
1195 'variables' subcommand specifies a varlist which contains
1196 both dependent and independent variables. The dependent
1197 variables are specified with the 'dependent'
1198 subcommand, and maybe also in the 'variables' subcommand.
1199 We need to separate the two.
1201 if (!is_depvar (i, cmd.v_dependent[k]))
1203 if (var_is_alpha (v_variables[i]))
1205 design_matrix_set_categorical (X, row,
1206 v_variables[i], val);
1210 design_matrix_set_numeric (X, row, v_variables[i],
1215 val = case_data (&c, cmd.v_dependent[k]);
1216 gsl_vector_set (Y, row, val->f);
1221 Now that we know the number of coefficients, allocate space
1222 and store pointers to the variables that correspond to the
1225 coeff_init (models[k], X);
1228 Find the least-squares estimates and other statistics.
1230 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1231 subcommand_statistics (cmd.a_statistics, models[k]);
1232 subcommand_export (cmd.sbc_export, models[k]);
1234 gsl_vector_free (Y);
1235 design_matrix_destroy (X);
1237 free (lopts.get_indep_mean_std);
1238 casereader_destroy (r);
1241 free (is_missing_case);