1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/category.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <libpspp/taint.h>
42 #include <math/design-matrix.h>
43 #include <math/coefficient.h>
44 #include <math/linreg/linreg.h>
45 #include <math/moments.h>
46 #include <output/table.h>
51 #define _(msgid) gettext (msgid)
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
78 +save[sv_]=resid,pred;
83 static struct cmd_regression cmd;
86 Moments for each of the variables used.
91 const struct variable *v;
95 Transformations for saving predicted values
100 int n_trns; /* Number of transformations. */
101 int trns_id; /* Which trns is this one? */
102 pspp_linreg_cache *c; /* Linear model for this trns. */
105 Variables used (both explanatory and response).
107 static const struct variable **v_variables;
112 static size_t n_variables;
114 static bool run_regression (struct casereader *, struct cmd_regression *,
115 struct dataset *, pspp_linreg_cache **);
118 STATISTICS subcommand output functions.
120 static void reg_stats_r (pspp_linreg_cache *);
121 static void reg_stats_coeff (pspp_linreg_cache *);
122 static void reg_stats_anova (pspp_linreg_cache *);
123 static void reg_stats_outs (pspp_linreg_cache *);
124 static void reg_stats_zpp (pspp_linreg_cache *);
125 static void reg_stats_label (pspp_linreg_cache *);
126 static void reg_stats_sha (pspp_linreg_cache *);
127 static void reg_stats_ci (pspp_linreg_cache *);
128 static void reg_stats_f (pspp_linreg_cache *);
129 static void reg_stats_bcov (pspp_linreg_cache *);
130 static void reg_stats_ses (pspp_linreg_cache *);
131 static void reg_stats_xtx (pspp_linreg_cache *);
132 static void reg_stats_collin (pspp_linreg_cache *);
133 static void reg_stats_tol (pspp_linreg_cache *);
134 static void reg_stats_selection (pspp_linreg_cache *);
135 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
136 int, pspp_linreg_cache *);
139 reg_stats_r (pspp_linreg_cache * c)
149 rsq = c->ssm / c->sst;
150 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
151 std_error = sqrt ((c->n_indeps - 1.0) / (c->n_obs - 1.0));
152 t = tab_create (n_cols, n_rows, 0);
153 tab_dim (t, tab_natural_dimensions);
154 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
155 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
156 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
157 tab_vline (t, TAL_0, 1, 0, 0);
159 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
160 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
161 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
162 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
163 tab_float (t, 1, 1, TAB_RIGHT, sqrt (rsq), 10, 2);
164 tab_float (t, 2, 1, TAB_RIGHT, rsq, 10, 2);
165 tab_float (t, 3, 1, TAB_RIGHT, adjrsq, 10, 2);
166 tab_float (t, 4, 1, TAB_RIGHT, std_error, 10, 2);
167 tab_title (t, _("Model Summary"));
172 Table showing estimated regression coefficients.
175 reg_stats_coeff (pspp_linreg_cache * c)
188 const struct variable *v;
189 const union value *val;
193 n_rows = c->n_coeffs + 3;
195 t = tab_create (n_cols, n_rows, 0);
196 tab_headers (t, 2, 0, 1, 0);
197 tab_dim (t, tab_natural_dimensions);
198 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
199 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
200 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
201 tab_vline (t, TAL_0, 1, 0, 0);
203 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
204 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
205 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
206 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
207 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
208 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
209 tab_float (t, 2, 1, 0, c->intercept, 10, 2);
210 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
211 tab_float (t, 3, 1, 0, std_err, 10, 2);
212 beta = c->intercept / c->depvar_std;
213 tab_float (t, 4, 1, 0, beta, 10, 2);
214 t_stat = c->intercept / std_err;
215 tab_float (t, 5, 1, 0, t_stat, 10, 2);
216 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
217 tab_float (t, 6, 1, 0, pval, 10, 2);
218 for (j = 0; j < c->n_coeffs; j++)
221 ds_init_empty (&tstr);
224 v = pspp_coeff_get_var (c->coeff[j], 0);
225 label = var_to_string (v);
226 /* Do not overwrite the variable's name. */
227 ds_put_cstr (&tstr, label);
228 if (var_is_alpha (v))
231 Append the value associated with this coefficient.
232 This makes sense only if we us the usual binary encoding
236 val = pspp_coeff_get_value (c->coeff[j], v);
238 var_append_value_name (v, val, &tstr);
241 tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
243 Regression coefficients.
245 coeff = c->coeff[j]->estimate;
246 tab_float (t, 2, this_row, 0, coeff, 10, 2);
248 Standard error of the coefficients.
250 std_err = sqrt (gsl_matrix_get (c->cov, j + 1, j + 1));
251 tab_float (t, 3, this_row, 0, std_err, 10, 2);
253 'Standardized' coefficient, i.e., regression coefficient
254 if all variables had unit variance.
256 beta = gsl_vector_get (c->indep_std, j + 1);
257 beta *= coeff / c->depvar_std;
258 tab_float (t, 4, this_row, 0, beta, 10, 2);
261 Test statistic for H0: coefficient is 0.
263 t_stat = coeff / std_err;
264 tab_float (t, 5, this_row, 0, t_stat, 10, 2);
266 P values for the test statistic above.
269 2 * gsl_cdf_tdist_Q (fabs (t_stat),
270 (double) (c->n_obs - c->n_coeffs));
271 tab_float (t, 6, this_row, 0, pval, 10, 2);
274 tab_title (t, _("Coefficients"));
279 Display the ANOVA table.
282 reg_stats_anova (pspp_linreg_cache * c)
286 const double msm = c->ssm / c->dfm;
287 const double mse = c->sse / c->dfe;
288 const double F = msm / mse;
289 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
294 t = tab_create (n_cols, n_rows, 0);
295 tab_headers (t, 2, 0, 1, 0);
296 tab_dim (t, tab_natural_dimensions);
298 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
300 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
301 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
302 tab_vline (t, TAL_0, 1, 0, 0);
304 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
305 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
306 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
307 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
308 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
310 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
311 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
312 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
314 /* Sums of Squares */
315 tab_float (t, 2, 1, 0, c->ssm, 10, 2);
316 tab_float (t, 2, 3, 0, c->sst, 10, 2);
317 tab_float (t, 2, 2, 0, c->sse, 10, 2);
320 /* Degrees of freedom */
321 tab_text (t, 3, 1, TAB_RIGHT | TAT_PRINTF, "%g", c->dfm);
322 tab_text (t, 3, 2, TAB_RIGHT | TAT_PRINTF, "%g", c->dfe);
323 tab_text (t, 3, 3, TAB_RIGHT | TAT_PRINTF, "%g", c->dft);
326 tab_float (t, 4, 1, TAB_RIGHT, msm, 8, 3);
327 tab_float (t, 4, 2, TAB_RIGHT, mse, 8, 3);
329 tab_float (t, 5, 1, 0, F, 8, 3);
331 tab_float (t, 6, 1, 0, pval, 8, 3);
333 tab_title (t, _("ANOVA"));
338 reg_stats_outs (pspp_linreg_cache * c)
344 reg_stats_zpp (pspp_linreg_cache * c)
350 reg_stats_label (pspp_linreg_cache * c)
356 reg_stats_sha (pspp_linreg_cache * c)
361 reg_stats_ci (pspp_linreg_cache * c)
366 reg_stats_f (pspp_linreg_cache * c)
371 reg_stats_bcov (pspp_linreg_cache * c)
383 n_cols = c->n_indeps + 1 + 2;
384 n_rows = 2 * (c->n_indeps + 1);
385 t = tab_create (n_cols, n_rows, 0);
386 tab_headers (t, 2, 0, 1, 0);
387 tab_dim (t, tab_natural_dimensions);
388 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
389 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
390 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
391 tab_vline (t, TAL_0, 1, 0, 0);
392 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
393 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
394 for (i = 0; i < c->n_coeffs; i++)
396 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
397 label = var_to_string (v);
398 tab_text (t, 2, i, TAB_CENTER, label);
399 tab_text (t, i + 2, 0, TAB_CENTER, label);
400 for (k = 1; k < c->n_coeffs; k++)
402 col = (i <= k) ? k : i;
403 row = (i <= k) ? i : k;
404 tab_float (t, k + 2, i, TAB_CENTER,
405 gsl_matrix_get (c->cov, row, col), 8, 3);
408 tab_title (t, _("Coefficient Correlations"));
412 reg_stats_ses (pspp_linreg_cache * c)
417 reg_stats_xtx (pspp_linreg_cache * c)
422 reg_stats_collin (pspp_linreg_cache * c)
427 reg_stats_tol (pspp_linreg_cache * c)
432 reg_stats_selection (pspp_linreg_cache * c)
438 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
439 int keyword, pspp_linreg_cache * c)
448 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
451 The order here must match the order in which the STATISTICS
452 keywords appear in the specification section above.
479 Set everything but F.
481 for (i = 0; i < f; i++)
488 for (i = 0; i < all; i++)
496 Default output: ANOVA table, parameter estimates,
497 and statistics for variables not entered into model,
500 if (keywords[defaults] | d)
508 statistics_keyword_output (reg_stats_r, keywords[r], c);
509 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
510 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
511 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
512 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
513 statistics_keyword_output (reg_stats_label, keywords[label], c);
514 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
515 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
516 statistics_keyword_output (reg_stats_f, keywords[f], c);
517 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
518 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
519 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
520 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
521 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
522 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
526 Free the transformation. Free its linear model if this
527 transformation is the last one.
530 regression_trns_free (void *t_)
533 struct reg_trns *t = t_;
535 if (t->trns_id == t->n_trns)
537 result = pspp_linreg_cache_free (t->c);
545 Gets the predicted values.
548 regression_trns_pred_proc (void *t_, struct ccase *c,
549 casenumber case_idx UNUSED)
553 struct reg_trns *trns = t_;
554 pspp_linreg_cache *model;
555 union value *output = NULL;
556 const union value **vals = NULL;
557 const struct variable **vars = NULL;
559 assert (trns != NULL);
561 assert (model != NULL);
562 assert (model->depvar != NULL);
563 assert (model->pred != NULL);
565 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
566 n_vals = (*model->get_vars) (model, vars);
568 vals = xnmalloc (n_vals, sizeof (*vals));
569 output = case_data_rw (c, model->pred);
570 assert (output != NULL);
572 for (i = 0; i < n_vals; i++)
574 vals[i] = case_data (c, vars[i]);
576 output->f = (*model->predict) ((const struct variable **) vars,
577 vals, model, n_vals);
580 return TRNS_CONTINUE;
587 regression_trns_resid_proc (void *t_, struct ccase *c,
588 casenumber case_idx UNUSED)
592 struct reg_trns *trns = t_;
593 pspp_linreg_cache *model;
594 union value *output = NULL;
595 const union value **vals = NULL;
596 const union value *obs = NULL;
597 const struct variable **vars = NULL;
599 assert (trns != NULL);
601 assert (model != NULL);
602 assert (model->depvar != NULL);
603 assert (model->resid != NULL);
605 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
606 n_vals = (*model->get_vars) (model, vars);
608 vals = xnmalloc (n_vals, sizeof (*vals));
609 output = case_data_rw (c, model->resid);
610 assert (output != NULL);
612 for (i = 0; i < n_vals; i++)
614 vals[i] = case_data (c, vars[i]);
616 obs = case_data (c, model->depvar);
617 output->f = (*model->residual) ((const struct variable **) vars,
618 vals, obs, model, n_vals);
621 return TRNS_CONTINUE;
625 Returns false if NAME is a duplicate of any existing variable name.
628 try_name (const struct dictionary *dict, const char *name)
630 if (dict_lookup_var (dict, name) != NULL)
637 reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN],
638 const char prefix[VAR_NAME_LEN])
642 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
643 while (!try_name (dict, name))
646 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
651 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
652 pspp_linreg_cache * c, struct variable **v, int n_trns)
654 struct dictionary *dict = dataset_dict (ds);
655 static int trns_index = 1;
656 char name[VAR_NAME_LEN];
657 struct variable *new_var;
658 struct reg_trns *t = NULL;
660 t = xmalloc (sizeof (*t));
661 t->trns_id = trns_index;
664 reg_get_name (dict, name, prefix);
665 new_var = dict_create_var (dict, name, 0);
666 assert (new_var != NULL);
668 add_transformation (ds, f, regression_trns_free, t);
672 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
674 pspp_linreg_cache **lc;
678 assert (models != NULL);
682 /* Count the number of transformations we will need. */
683 for (i = 0; i < REGRESSION_SV_count; i++)
690 n_trns *= cmd.n_dependent;
692 for (lc = models; lc < models + cmd.n_dependent; lc++)
694 assert (*lc != NULL);
695 assert ((*lc)->depvar != NULL);
696 if (cmd.a_save[REGRESSION_SV_RESID])
698 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
699 &(*lc)->resid, n_trns);
701 if (cmd.a_save[REGRESSION_SV_PRED])
703 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
704 &(*lc)->pred, n_trns);
710 for (lc = models; lc < models + cmd.n_dependent; lc++)
714 pspp_linreg_cache_free (*lc);
721 cmd_regression (struct lexer *lexer, struct dataset *ds)
723 struct casegrouper *grouper;
724 struct casereader *group;
725 pspp_linreg_cache **models;
729 if (!parse_regression (lexer, ds, &cmd, NULL))
734 models = xnmalloc (cmd.n_dependent, sizeof *models);
735 for (i = 0; i < cmd.n_dependent; i++)
741 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
742 while (casegrouper_get_next_group (grouper, &group))
743 run_regression (group, &cmd, ds, models);
744 ok = casegrouper_destroy (grouper);
745 ok = proc_commit (ds) && ok;
747 subcommand_save (ds, cmd.sbc_save, models);
750 free_regression (&cmd);
752 return ok ? CMD_SUCCESS : CMD_FAILURE;
756 Is variable k the dependent variable?
759 is_depvar (size_t k, const struct variable *v)
761 return v == v_variables[k];
764 /* Parser for the variables sub command */
766 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
767 struct cmd_regression *cmd UNUSED,
770 const struct dictionary *dict = dataset_dict (ds);
772 lex_match (lexer, '=');
774 if ((lex_token (lexer) != T_ID
775 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
776 && lex_token (lexer) != T_ALL)
780 if (!parse_variables_const
781 (lexer, dict, &v_variables, &n_variables, PV_NONE))
786 assert (n_variables);
791 /* Identify the explanatory variables in v_variables. Returns
792 the number of independent variables. */
794 identify_indep_vars (const struct variable **indep_vars,
795 const struct variable *depvar)
797 int n_indep_vars = 0;
800 for (i = 0; i < n_variables; i++)
801 if (!is_depvar (i, depvar))
802 indep_vars[n_indep_vars++] = v_variables[i];
803 if ((n_indep_vars < 1) && is_depvar (0, depvar))
806 There is only one independent variable, and it is the same
807 as the dependent variable. Print a warning and continue.
810 gettext ("The dependent variable is equal to the independent variable."
811 "The least squares line is therefore Y=X."
812 "Standard errors and related statistics may be meaningless."));
814 indep_vars[0] = v_variables[0];
819 /* Encode categorical variables.
820 Returns number of valid cases. */
822 prepare_categories (struct casereader *input,
823 const struct variable **vars, size_t n_vars,
824 struct moments_var *mom)
830 assert (vars != NULL);
831 assert (mom != NULL);
833 for (i = 0; i < n_vars; i++)
834 if (var_is_alpha (vars[i]))
835 cat_stored_values_create (vars[i]);
838 for (; casereader_read (input, &c); case_destroy (&c))
841 The second condition ensures the program will run even if
842 there is only one variable to act as both explanatory and
845 for (i = 0; i < n_vars; i++)
847 const union value *val = case_data (&c, vars[i]);
848 if (var_is_alpha (vars[i]))
849 cat_value_update (vars[i], val);
851 moments1_add (mom[i].m, val->f, 1.0);
855 casereader_destroy (input);
861 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
863 c->coeff = xnmalloc (dm->m->size2, sizeof (*c->coeff));
864 pspp_coeff_init (c->coeff, dm);
868 Put the moments in the linreg cache.
871 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
872 struct design_matrix *dm, size_t n)
882 Scan the variable names in the columns of the design matrix.
883 When we find the variable we need, insert its mean in the cache.
885 for (i = 0; i < dm->m->size2; i++)
887 for (j = 0; j < n; j++)
889 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
891 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
892 &skewness, &kurtosis);
893 gsl_vector_set (c->indep_means, i, mean);
894 gsl_vector_set (c->indep_std, i, sqrt (variance));
901 run_regression (struct casereader *input, struct cmd_regression *cmd,
902 struct dataset *ds, pspp_linreg_cache **models)
908 const struct variable **indep_vars;
909 struct design_matrix *X;
910 struct moments_var *mom;
913 pspp_linreg_opts lopts;
915 assert (models != NULL);
917 if (!casereader_peek (input, 0, &c))
919 casereader_destroy (input);
922 output_split_file_values (ds, &c);
927 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0);
930 for (i = 0; i < cmd->n_dependent; i++)
932 if (!var_is_numeric (cmd->v_dependent[i]))
934 msg (SE, _("Dependent variable must be numeric."));
939 mom = xnmalloc (n_variables, sizeof (*mom));
940 for (i = 0; i < n_variables; i++)
942 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
943 (mom + i)->v = v_variables[i];
945 lopts.get_depvar_mean_std = 1;
947 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
948 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
950 for (k = 0; k < cmd->n_dependent; k++)
952 const struct variable *dep_var;
953 struct casereader *reader;
956 size_t n_data; /* Number of valid cases. */
958 dep_var = cmd->v_dependent[k];
959 n_indep = identify_indep_vars (indep_vars, dep_var);
960 reader = casereader_clone (input);
961 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
963 reader = casereader_create_filter_missing (reader, &dep_var, 1,
965 n_data = prepare_categories (casereader_clone (reader),
966 indep_vars, n_indep, mom);
968 if ((n_data > 0) && (n_indep > 0))
970 Y = gsl_vector_alloc (n_data);
972 design_matrix_create (n_indep,
973 (const struct variable **) indep_vars,
975 for (i = 0; i < X->m->size2; i++)
977 lopts.get_indep_mean_std[i] = 1;
979 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
980 models[k]->depvar = dep_var;
982 For large data sets, use QR decomposition.
984 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
986 models[k]->method = PSPP_LINREG_QR;
990 The second pass fills the design matrix.
992 reader = casereader_create_counter (reader, &row, -1);
993 for (; casereader_read (reader, &c); case_destroy (&c))
995 for (i = 0; i < n_indep; ++i)
997 const struct variable *v = indep_vars[i];
998 const union value *val = case_data (&c, v);
999 if (var_is_alpha (v))
1000 design_matrix_set_categorical (X, row, v, val);
1002 design_matrix_set_numeric (X, row, v, val);
1004 gsl_vector_set (Y, row, case_num (&c, dep_var));
1007 Now that we know the number of coefficients, allocate space
1008 and store pointers to the variables that correspond to the
1011 coeff_init (models[k], X);
1014 Find the least-squares estimates and other statistics.
1016 pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
1017 compute_moments (models[k], mom, X, n_variables);
1019 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1021 subcommand_statistics (cmd->a_statistics, models[k]);
1024 gsl_vector_free (Y);
1025 design_matrix_destroy (X);
1030 gettext ("No valid data found. This command was skipped."));
1032 casereader_destroy (reader);
1034 for (i = 0; i < n_variables; i++)
1036 moments1_destroy ((mom + i)->m);
1040 free (lopts.get_indep_mean_std);
1041 casereader_destroy (input);