1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_vector.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/category.h>
29 #include <data/dictionary.h>
30 #include <data/missing-values.h>
31 #include <data/procedure.h>
32 #include <data/transformations.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/data-io/file-handle.h>
38 #include <language/lexer/lexer.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/message.h>
41 #include <libpspp/taint.h>
42 #include <math/design-matrix.h>
43 #include <math/coefficient.h>
44 #include <math/linreg.h>
45 #include <math/moments.h>
46 #include <output/table.h>
51 #define _(msgid) gettext (msgid)
53 #define REG_LARGE_DATA 1000
58 "REGRESSION" (regression_):
78 +save[sv_]=resid,pred;
83 static struct cmd_regression cmd;
86 Moments for each of the variables used.
91 const struct variable *v;
95 Transformations for saving predicted values
100 int n_trns; /* Number of transformations. */
101 int trns_id; /* Which trns is this one? */
102 pspp_linreg_cache *c; /* Linear model for this trns. */
105 Variables used (both explanatory and response).
107 static const struct variable **v_variables;
112 static size_t n_variables;
114 static bool run_regression (struct casereader *, struct cmd_regression *,
115 struct dataset *, pspp_linreg_cache **);
118 STATISTICS subcommand output functions.
120 static void reg_stats_r (pspp_linreg_cache *);
121 static void reg_stats_coeff (pspp_linreg_cache *);
122 static void reg_stats_anova (pspp_linreg_cache *);
123 static void reg_stats_outs (pspp_linreg_cache *);
124 static void reg_stats_zpp (pspp_linreg_cache *);
125 static void reg_stats_label (pspp_linreg_cache *);
126 static void reg_stats_sha (pspp_linreg_cache *);
127 static void reg_stats_ci (pspp_linreg_cache *);
128 static void reg_stats_f (pspp_linreg_cache *);
129 static void reg_stats_bcov (pspp_linreg_cache *);
130 static void reg_stats_ses (pspp_linreg_cache *);
131 static void reg_stats_xtx (pspp_linreg_cache *);
132 static void reg_stats_collin (pspp_linreg_cache *);
133 static void reg_stats_tol (pspp_linreg_cache *);
134 static void reg_stats_selection (pspp_linreg_cache *);
135 static void statistics_keyword_output (void (*)(pspp_linreg_cache *),
136 int, pspp_linreg_cache *);
139 reg_stats_r (pspp_linreg_cache * c)
149 rsq = c->ssm / c->sst;
150 adjrsq = 1.0 - (1.0 - rsq) * (c->n_obs - 1.0) / (c->n_obs - c->n_indeps);
151 std_error = sqrt (pspp_linreg_mse (c));
152 t = tab_create (n_cols, n_rows, 0);
153 tab_dim (t, tab_natural_dimensions);
154 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
155 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
156 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
157 tab_vline (t, TAL_0, 1, 0, 0);
159 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
160 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
161 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
162 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
163 tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
164 tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
165 tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
166 tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
167 tab_title (t, _("Model Summary"));
172 Table showing estimated regression coefficients.
175 reg_stats_coeff (pspp_linreg_cache * c)
187 const struct variable *v;
188 const union value *val;
192 n_rows = c->n_coeffs + 3;
194 t = tab_create (n_cols, n_rows, 0);
195 tab_headers (t, 2, 0, 1, 0);
196 tab_dim (t, tab_natural_dimensions);
197 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
198 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
199 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
200 tab_vline (t, TAL_0, 1, 0, 0);
202 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
203 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
204 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
205 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
206 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
207 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
208 tab_double (t, 2, 1, 0, c->intercept, NULL);
209 std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
210 tab_double (t, 3, 1, 0, std_err, NULL);
211 tab_double (t, 4, 1, 0, 0.0, NULL);
212 t_stat = c->intercept / std_err;
213 tab_double (t, 5, 1, 0, t_stat, NULL);
214 pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
215 tab_double (t, 6, 1, 0, pval, NULL);
216 for (j = 0; j < c->n_coeffs; j++)
219 ds_init_empty (&tstr);
222 v = pspp_coeff_get_var (c->coeff[j], 0);
223 label = var_to_string (v);
224 /* Do not overwrite the variable's name. */
225 ds_put_cstr (&tstr, label);
226 if (var_is_alpha (v))
229 Append the value associated with this coefficient.
230 This makes sense only if we us the usual binary encoding
234 val = pspp_coeff_get_value (c->coeff[j], v);
236 var_append_value_name (v, val, &tstr);
239 tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
241 Regression coefficients.
243 tab_double (t, 2, this_row, 0, c->coeff[j]->estimate, NULL);
245 Standard error of the coefficients.
247 std_err = sqrt (gsl_matrix_get (c->cov, j + 1, j + 1));
248 tab_double (t, 3, this_row, 0, std_err, NULL);
250 Standardized coefficient, i.e., regression coefficient
251 if all variables had unit variance.
253 beta = pspp_coeff_get_sd (c->coeff[j]);
254 beta *= c->coeff[j]->estimate / c->depvar_std;
255 tab_double (t, 4, this_row, 0, beta, NULL);
258 Test statistic for H0: coefficient is 0.
260 t_stat = c->coeff[j]->estimate / std_err;
261 tab_double (t, 5, this_row, 0, t_stat, NULL);
263 P values for the test statistic above.
266 2 * gsl_cdf_tdist_Q (fabs (t_stat),
267 (double) (c->n_obs - c->n_coeffs));
268 tab_double (t, 6, this_row, 0, pval, NULL);
271 tab_title (t, _("Coefficients"));
276 Display the ANOVA table.
279 reg_stats_anova (pspp_linreg_cache * c)
283 const double msm = c->ssm / c->dfm;
284 const double mse = pspp_linreg_mse (c);
285 const double F = msm / mse;
286 const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
291 t = tab_create (n_cols, n_rows, 0);
292 tab_headers (t, 2, 0, 1, 0);
293 tab_dim (t, tab_natural_dimensions);
295 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
297 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
298 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
299 tab_vline (t, TAL_0, 1, 0, 0);
301 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
302 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
303 tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
304 tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
305 tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
307 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
308 tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
309 tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
311 /* Sums of Squares */
312 tab_double (t, 2, 1, 0, c->ssm, NULL);
313 tab_double (t, 2, 3, 0, c->sst, NULL);
314 tab_double (t, 2, 2, 0, c->sse, NULL);
317 /* Degrees of freedom */
318 tab_text (t, 3, 1, TAB_RIGHT | TAT_PRINTF, "%g", c->dfm);
319 tab_text (t, 3, 2, TAB_RIGHT | TAT_PRINTF, "%g", c->dfe);
320 tab_text (t, 3, 3, TAB_RIGHT | TAT_PRINTF, "%g", c->dft);
323 tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
324 tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
326 tab_double (t, 5, 1, 0, F, NULL);
328 tab_double (t, 6, 1, 0, pval, NULL);
330 tab_title (t, _("ANOVA"));
335 reg_stats_outs (pspp_linreg_cache * c)
341 reg_stats_zpp (pspp_linreg_cache * c)
347 reg_stats_label (pspp_linreg_cache * c)
353 reg_stats_sha (pspp_linreg_cache * c)
358 reg_stats_ci (pspp_linreg_cache * c)
363 reg_stats_f (pspp_linreg_cache * c)
368 reg_stats_bcov (pspp_linreg_cache * c)
380 n_cols = c->n_indeps + 1 + 2;
381 n_rows = 2 * (c->n_indeps + 1);
382 t = tab_create (n_cols, n_rows, 0);
383 tab_headers (t, 2, 0, 1, 0);
384 tab_dim (t, tab_natural_dimensions);
385 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
386 tab_hline (t, TAL_2, 0, n_cols - 1, 1);
387 tab_vline (t, TAL_2, 2, 0, n_rows - 1);
388 tab_vline (t, TAL_0, 1, 0, 0);
389 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
390 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
391 for (i = 0; i < c->n_coeffs; i++)
393 const struct variable *v = pspp_coeff_get_var (c->coeff[i], 0);
394 label = var_to_string (v);
395 tab_text (t, 2, i, TAB_CENTER, label);
396 tab_text (t, i + 2, 0, TAB_CENTER, label);
397 for (k = 1; k < c->n_coeffs; k++)
399 col = (i <= k) ? k : i;
400 row = (i <= k) ? i : k;
401 tab_double (t, k + 2, i, TAB_CENTER,
402 gsl_matrix_get (c->cov, row, col), NULL);
405 tab_title (t, _("Coefficient Correlations"));
409 reg_stats_ses (pspp_linreg_cache * c)
414 reg_stats_xtx (pspp_linreg_cache * c)
419 reg_stats_collin (pspp_linreg_cache * c)
424 reg_stats_tol (pspp_linreg_cache * c)
429 reg_stats_selection (pspp_linreg_cache * c)
435 statistics_keyword_output (void (*function) (pspp_linreg_cache *),
436 int keyword, pspp_linreg_cache * c)
445 subcommand_statistics (int *keywords, pspp_linreg_cache * c)
448 The order here must match the order in which the STATISTICS
449 keywords appear in the specification section above.
476 Set everything but F.
478 for (i = 0; i < f; i++)
485 for (i = 0; i < all; i++)
493 Default output: ANOVA table, parameter estimates,
494 and statistics for variables not entered into model,
497 if (keywords[defaults] | d)
505 statistics_keyword_output (reg_stats_r, keywords[r], c);
506 statistics_keyword_output (reg_stats_anova, keywords[anova], c);
507 statistics_keyword_output (reg_stats_coeff, keywords[coeff], c);
508 statistics_keyword_output (reg_stats_outs, keywords[outs], c);
509 statistics_keyword_output (reg_stats_zpp, keywords[zpp], c);
510 statistics_keyword_output (reg_stats_label, keywords[label], c);
511 statistics_keyword_output (reg_stats_sha, keywords[sha], c);
512 statistics_keyword_output (reg_stats_ci, keywords[ci], c);
513 statistics_keyword_output (reg_stats_f, keywords[f], c);
514 statistics_keyword_output (reg_stats_bcov, keywords[bcov], c);
515 statistics_keyword_output (reg_stats_ses, keywords[ses], c);
516 statistics_keyword_output (reg_stats_xtx, keywords[xtx], c);
517 statistics_keyword_output (reg_stats_collin, keywords[collin], c);
518 statistics_keyword_output (reg_stats_tol, keywords[tol], c);
519 statistics_keyword_output (reg_stats_selection, keywords[selection], c);
523 Free the transformation. Free its linear model if this
524 transformation is the last one.
527 regression_trns_free (void *t_)
530 struct reg_trns *t = t_;
532 if (t->trns_id == t->n_trns)
534 result = pspp_linreg_cache_free (t->c);
542 Gets the predicted values.
545 regression_trns_pred_proc (void *t_, struct ccase *c,
546 casenumber case_idx UNUSED)
550 struct reg_trns *trns = t_;
551 pspp_linreg_cache *model;
552 union value *output = NULL;
553 const union value **vals = NULL;
554 const struct variable **vars = NULL;
556 assert (trns != NULL);
558 assert (model != NULL);
559 assert (model->depvar != NULL);
560 assert (model->pred != NULL);
562 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
563 n_vals = (*model->get_vars) (model, vars);
565 vals = xnmalloc (n_vals, sizeof (*vals));
566 output = case_data_rw (c, model->pred);
567 assert (output != NULL);
569 for (i = 0; i < n_vals; i++)
571 vals[i] = case_data (c, vars[i]);
573 output->f = (*model->predict) ((const struct variable **) vars,
574 vals, model, n_vals);
577 return TRNS_CONTINUE;
584 regression_trns_resid_proc (void *t_, struct ccase *c,
585 casenumber case_idx UNUSED)
589 struct reg_trns *trns = t_;
590 pspp_linreg_cache *model;
591 union value *output = NULL;
592 const union value **vals = NULL;
593 const union value *obs = NULL;
594 const struct variable **vars = NULL;
596 assert (trns != NULL);
598 assert (model != NULL);
599 assert (model->depvar != NULL);
600 assert (model->resid != NULL);
602 vars = xnmalloc (model->n_coeffs, sizeof (*vars));
603 n_vals = (*model->get_vars) (model, vars);
605 vals = xnmalloc (n_vals, sizeof (*vals));
606 output = case_data_rw (c, model->resid);
607 assert (output != NULL);
609 for (i = 0; i < n_vals; i++)
611 vals[i] = case_data (c, vars[i]);
613 obs = case_data (c, model->depvar);
614 output->f = (*model->residual) ((const struct variable **) vars,
615 vals, obs, model, n_vals);
618 return TRNS_CONTINUE;
622 Returns false if NAME is a duplicate of any existing variable name.
625 try_name (const struct dictionary *dict, const char *name)
627 if (dict_lookup_var (dict, name) != NULL)
634 reg_get_name (const struct dictionary *dict, char name[VAR_NAME_LEN],
635 const char prefix[VAR_NAME_LEN])
639 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
640 while (!try_name (dict, name))
643 snprintf (name, VAR_NAME_LEN, "%s%d", prefix, i);
648 reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
649 pspp_linreg_cache * c, struct variable **v, int n_trns)
651 struct dictionary *dict = dataset_dict (ds);
652 static int trns_index = 1;
653 char name[VAR_NAME_LEN];
654 struct variable *new_var;
655 struct reg_trns *t = NULL;
657 t = xmalloc (sizeof (*t));
658 t->trns_id = trns_index;
661 reg_get_name (dict, name, prefix);
662 new_var = dict_create_var (dict, name, 0);
663 assert (new_var != NULL);
665 add_transformation (ds, f, regression_trns_free, t);
669 subcommand_save (struct dataset *ds, int save, pspp_linreg_cache ** models)
671 pspp_linreg_cache **lc;
675 assert (models != NULL);
679 /* Count the number of transformations we will need. */
680 for (i = 0; i < REGRESSION_SV_count; i++)
687 n_trns *= cmd.n_dependent;
689 for (lc = models; lc < models + cmd.n_dependent; lc++)
691 assert (*lc != NULL);
692 assert ((*lc)->depvar != NULL);
693 if (cmd.a_save[REGRESSION_SV_RESID])
695 reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
696 &(*lc)->resid, n_trns);
698 if (cmd.a_save[REGRESSION_SV_PRED])
700 reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
701 &(*lc)->pred, n_trns);
707 for (lc = models; lc < models + cmd.n_dependent; lc++)
711 pspp_linreg_cache_free (*lc);
718 cmd_regression (struct lexer *lexer, struct dataset *ds)
720 struct casegrouper *grouper;
721 struct casereader *group;
722 pspp_linreg_cache **models;
726 if (!parse_regression (lexer, ds, &cmd, NULL))
731 models = xnmalloc (cmd.n_dependent, sizeof *models);
732 for (i = 0; i < cmd.n_dependent; i++)
738 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
739 while (casegrouper_get_next_group (grouper, &group))
740 run_regression (group, &cmd, ds, models);
741 ok = casegrouper_destroy (grouper);
742 ok = proc_commit (ds) && ok;
744 subcommand_save (ds, cmd.sbc_save, models);
747 free_regression (&cmd);
749 return ok ? CMD_SUCCESS : CMD_FAILURE;
753 Is variable k the dependent variable?
756 is_depvar (size_t k, const struct variable *v)
758 return v == v_variables[k];
761 /* Parser for the variables sub command */
763 regression_custom_variables (struct lexer *lexer, struct dataset *ds,
764 struct cmd_regression *cmd UNUSED,
767 const struct dictionary *dict = dataset_dict (ds);
769 lex_match (lexer, '=');
771 if ((lex_token (lexer) != T_ID
772 || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
773 && lex_token (lexer) != T_ALL)
777 if (!parse_variables_const
778 (lexer, dict, &v_variables, &n_variables, PV_NONE))
783 assert (n_variables);
788 /* Identify the explanatory variables in v_variables. Returns
789 the number of independent variables. */
791 identify_indep_vars (const struct variable **indep_vars,
792 const struct variable *depvar)
794 int n_indep_vars = 0;
797 for (i = 0; i < n_variables; i++)
798 if (!is_depvar (i, depvar))
799 indep_vars[n_indep_vars++] = v_variables[i];
800 if ((n_indep_vars < 1) && is_depvar (0, depvar))
803 There is only one independent variable, and it is the same
804 as the dependent variable. Print a warning and continue.
807 gettext ("The dependent variable is equal to the independent variable."
808 "The least squares line is therefore Y=X."
809 "Standard errors and related statistics may be meaningless."));
811 indep_vars[0] = v_variables[0];
816 /* Encode categorical variables.
817 Returns number of valid cases. */
819 prepare_categories (struct casereader *input,
820 const struct variable **vars, size_t n_vars,
821 struct moments_var *mom)
827 assert (vars != NULL);
828 assert (mom != NULL);
830 for (i = 0; i < n_vars; i++)
831 if (var_is_alpha (vars[i]))
832 cat_stored_values_create (vars[i]);
835 for (; casereader_read (input, &c); case_destroy (&c))
838 The second condition ensures the program will run even if
839 there is only one variable to act as both explanatory and
842 for (i = 0; i < n_vars; i++)
844 const union value *val = case_data (&c, vars[i]);
845 if (var_is_alpha (vars[i]))
846 cat_value_update (vars[i], val);
848 moments1_add (mom[i].m, val->f, 1.0);
852 casereader_destroy (input);
858 coeff_init (pspp_linreg_cache * c, struct design_matrix *dm)
860 c->coeff = xnmalloc (dm->m->size2, sizeof (*c->coeff));
861 pspp_coeff_init (c->coeff, dm);
865 Put the moments in the linreg cache.
868 compute_moments (pspp_linreg_cache * c, struct moments_var *mom,
869 struct design_matrix *dm, size_t n)
879 Scan the variable names in the columns of the design matrix.
880 When we find the variable we need, insert its mean in the cache.
882 for (i = 0; i < dm->m->size2; i++)
884 for (j = 0; j < n; j++)
886 if (design_matrix_col_to_var (dm, i) == (mom + j)->v)
888 moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
889 &skewness, &kurtosis);
890 pspp_linreg_set_indep_variable_mean (c, (mom + j)->v, mean);
891 pspp_linreg_set_indep_variable_sd (c, (mom + j)->v, sqrt (variance));
898 run_regression (struct casereader *input, struct cmd_regression *cmd,
899 struct dataset *ds, pspp_linreg_cache **models)
905 const struct variable **indep_vars;
906 struct design_matrix *X;
907 struct moments_var *mom;
910 pspp_linreg_opts lopts;
912 assert (models != NULL);
914 if (!casereader_peek (input, 0, &c))
916 casereader_destroy (input);
919 output_split_file_values (ds, &c);
924 dict_get_vars (dataset_dict (ds), &v_variables, &n_variables, 0);
927 for (i = 0; i < cmd->n_dependent; i++)
929 if (!var_is_numeric (cmd->v_dependent[i]))
931 msg (SE, _("Dependent variable must be numeric."));
936 mom = xnmalloc (n_variables, sizeof (*mom));
937 for (i = 0; i < n_variables; i++)
939 (mom + i)->m = moments1_create (MOMENT_VARIANCE);
940 (mom + i)->v = v_variables[i];
942 lopts.get_depvar_mean_std = 1;
944 lopts.get_indep_mean_std = xnmalloc (n_variables, sizeof (int));
945 indep_vars = xnmalloc (n_variables, sizeof *indep_vars);
947 for (k = 0; k < cmd->n_dependent; k++)
949 const struct variable *dep_var;
950 struct casereader *reader;
953 size_t n_data; /* Number of valid cases. */
955 dep_var = cmd->v_dependent[k];
956 n_indep = identify_indep_vars (indep_vars, dep_var);
957 reader = casereader_clone (input);
958 reader = casereader_create_filter_missing (reader, indep_vars, n_indep,
960 reader = casereader_create_filter_missing (reader, &dep_var, 1,
962 n_data = prepare_categories (casereader_clone (reader),
963 indep_vars, n_indep, mom);
965 if ((n_data > 0) && (n_indep > 0))
967 Y = gsl_vector_alloc (n_data);
969 design_matrix_create (n_indep,
970 (const struct variable **) indep_vars,
972 for (i = 0; i < X->m->size2; i++)
974 lopts.get_indep_mean_std[i] = 1;
976 models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
977 models[k]->depvar = dep_var;
979 For large data sets, use QR decomposition.
981 if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
983 models[k]->method = PSPP_LINREG_QR;
987 The second pass fills the design matrix.
989 reader = casereader_create_counter (reader, &row, -1);
990 for (; casereader_read (reader, &c); case_destroy (&c))
992 for (i = 0; i < n_indep; ++i)
994 const struct variable *v = indep_vars[i];
995 const union value *val = case_data (&c, v);
996 if (var_is_alpha (v))
997 design_matrix_set_categorical (X, row, v, val);
999 design_matrix_set_numeric (X, row, v, val);
1001 gsl_vector_set (Y, row, case_num (&c, dep_var));
1004 Now that we know the number of coefficients, allocate space
1005 and store pointers to the variables that correspond to the
1008 coeff_init (models[k], X);
1011 Find the least-squares estimates and other statistics.
1013 pspp_linreg ((const gsl_vector *) Y, X, &lopts, models[k]);
1015 if (!taint_has_tainted_successor (casereader_get_taint (input)))
1017 subcommand_statistics (cmd->a_statistics, models[k]);
1020 gsl_vector_free (Y);
1021 design_matrix_destroy (X);
1026 gettext ("No valid data found. This command was skipped."));
1028 casereader_destroy (reader);
1030 for (i = 0; i < n_variables; i++)
1032 moments1_destroy ((mom + i)->m);
1036 free (lopts.get_indep_mean_std);
1037 casereader_destroy (input);