X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.c;h=b4448dd9e58ead03cb61cf3d5e5b3c97201b1edf;hb=2a65f1fed17e8fae3c5d2ae22a476d0b885a69c3;hp=6cf94c6ad93d3254be30f0ab135c55cface149a6;hpb=3acf6da8d49333d9cc037dfe92a80c1615224a27;p=pspp diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index 6cf94c6ad9..b4448dd9e5 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,5 +1,6 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014, + 2016, 2017, 2019 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,8 +17,10 @@ #include +#include #include +#include #include #include @@ -41,16 +44,27 @@ #include "libpspp/message.h" #include "libpspp/taint.h" -#include "output/tab.h" +#include "output/pivot-table.h" + +#include "gl/intprops.h" +#include "gl/minmax.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid -#include +#define STATS_R 1 +#define STATS_COEFF 2 +#define STATS_ANOVA 4 +#define STATS_OUTS 8 +#define STATS_CI 16 +#define STATS_BCOV 32 +#define STATS_TOL 64 + +#define STATS_DEFAULT (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS) + -#define REG_LARGE_DATA 1000 struct regression { @@ -62,43 +76,41 @@ struct regression const struct variable **dep_vars; size_t n_dep_vars; - bool r; - bool coeff; - bool anova; - bool bcov; - + unsigned int stats; + double ci; bool resid; bool pred; -}; -struct per_split_ws -{ - linreg **models; + bool origin; }; struct regression_workspace { - struct per_split_ws *psw; + /* The new variables which will be introduced by /SAVE */ + const struct variable **predvars; + const struct variable **residvars; + /* A reader/writer pair to temporarily hold the + values of the new variables */ struct casewriter *writer; struct casereader *reader; + /* Indeces of the new values in the reader/writer (-1 if not applicable) */ int res_idx; int pred_idx; - int extras; - const struct variable **predvars; - const struct variable **residvars; + /* 0, 1 or 2 depending on what new variables are to be created */ + int extras; }; static void run_regression (const struct regression *cmd, - struct per_split_ws *psw, struct regression_workspace *ws, struct casereader *input); - +/* Return a string based on PREFIX which may be used as the name + of a new variable in DICT */ static char * reg_get_name (const struct dictionary *dict, const char *prefix) { @@ -127,87 +139,110 @@ create_aux_var (struct dataset *ds, const char *prefix) return var; } -struct thing +/* Auxiliary data for transformation when /SAVE is entered */ +struct save_trans_data { int n_dep_vars; struct regression_workspace *ws; }; -static int -transX (void *aux, struct ccase **c, casenumber x UNUSED) +static bool +save_trans_free (void *aux) +{ + struct save_trans_data *save_trans_data = aux; + free (save_trans_data->ws->predvars); + free (save_trans_data->ws->residvars); + + casereader_destroy (save_trans_data->ws->reader); + free (save_trans_data->ws); + free (save_trans_data); + return true; +} + +static enum trns_result +save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) { - struct thing *thing = aux; - struct regression_workspace *ws = thing->ws; - const struct ccase *in = casereader_read (ws->reader); + struct save_trans_data *save_trans_data = aux; + struct regression_workspace *ws = save_trans_data->ws; + struct ccase *in = casereader_read (ws->reader); if (in) { int k; *c = case_unshare (*c); - for (k = 0; k < thing->n_dep_vars; ++k) + for (k = 0; k < save_trans_data->n_dep_vars; ++k) { if (ws->pred_idx != -1) { - double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f; - case_data_rw (*c, ws->predvars[k])->f = pred; + double pred = case_num_idx (in, ws->extras * k + ws->pred_idx); + *case_num_rw (*c, ws->predvars[k]) = pred; } - + if (ws->res_idx != -1) { - double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f; - case_data_rw (*c, ws->residvars[k])->f = resid; + double resid = case_num_idx (in, ws->extras * k + ws->res_idx); + *case_num_rw (*c, ws->residvars[k]) = resid; } } + case_unref (in); } return TRNS_CONTINUE; } - int cmd_regression (struct lexer *lexer, struct dataset *ds) { - int n_splits = 0; struct regression_workspace workspace; struct regression regression; const struct dictionary *dict = dataset_dict (ds); bool save; - workspace.psw = NULL; memset (®ression, 0, sizeof (struct regression)); - regression.anova = true; - regression.coeff = true; - regression.r = true; - + regression.ci = 0.95; + regression.stats = STATS_DEFAULT; regression.pred = false; regression.resid = false; regression.ds = ds; + regression.origin = false; - /* Accept an optional, completely pointless "/VARIABLES=" */ - lex_match (lexer, T_SLASH); - if (lex_match_id (lexer, "VARIABLES")) - { - if (!lex_force_match (lexer, T_EQUALS)) - goto error; - } - - if (!parse_variables_const (lexer, dict, - ®ression.vars, ®ression.n_vars, - PV_NO_DUPLICATE | PV_NUMERIC)) - goto error; - - + bool variables_seen = false; + bool method_seen = false; + bool dependent_seen = false; while (lex_token (lexer) != T_ENDCMD) { lex_match (lexer, T_SLASH); - if (lex_match_id (lexer, "DEPENDENT")) + if (lex_match_id (lexer, "VARIABLES")) { - if (!lex_force_match (lexer, T_EQUALS)) - goto error; + if (method_seen) + { + msg (SE, _("VARIABLES may not appear after %s"), "METHOD"); + goto error; + } + if (dependent_seen) + { + msg (SE, _("VARIABLES may not appear after %s"), "DEPENDENT"); + goto error; + } + variables_seen = true; + lex_match (lexer, T_EQUALS); + + if (!parse_variables_const (lexer, dict, + ®ression.vars, ®ression.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + } + else if (lex_match_id (lexer, "DEPENDENT")) + { + dependent_seen = true; + lex_match (lexer, T_EQUALS); + + free (regression.dep_vars); + regression.n_dep_vars = 0; if (!parse_variables_const (lexer, dict, ®ression.dep_vars, @@ -215,17 +250,35 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) PV_NO_DUPLICATE | PV_NUMERIC)) goto error; } + else if (lex_match_id (lexer, "ORIGIN")) + { + regression.origin = true; + } + else if (lex_match_id (lexer, "NOORIGIN")) + { + regression.origin = false; + } else if (lex_match_id (lexer, "METHOD")) { + method_seen = true; lex_match (lexer, T_EQUALS); if (!lex_force_match_id (lexer, "ENTER")) { goto error; } + + if (! variables_seen) + { + if (!parse_variables_const (lexer, dict, + ®ression.vars, ®ression.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + } } else if (lex_match_id (lexer, "STATISTICS")) { + unsigned long statistics = 0; lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD @@ -233,21 +286,44 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { if (lex_match (lexer, T_ALL)) { + statistics = ~0; } else if (lex_match_id (lexer, "DEFAULTS")) { + statistics |= STATS_DEFAULT; } else if (lex_match_id (lexer, "R")) { + statistics |= STATS_R; } else if (lex_match_id (lexer, "COEFF")) { + statistics |= STATS_COEFF; } else if (lex_match_id (lexer, "ANOVA")) { + statistics |= STATS_ANOVA; } else if (lex_match_id (lexer, "BCOV")) { + statistics |= STATS_BCOV; + } + else if (lex_match_id (lexer, "TOL")) + { + statistics |= STATS_TOL; + } + else if (lex_match_id (lexer, "CI")) + { + statistics |= STATS_CI; + + if (lex_match (lexer, T_LPAREN) && + lex_force_num (lexer)) + { + regression.ci = lex_number (lexer) / 100.0; + lex_get (lexer); + if (! lex_force_match (lexer, T_RPAREN)) + goto error; + } } else { @@ -255,6 +331,10 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) goto error; } } + + if (statistics) + regression.stats = statistics; + } else if (lex_match_id (lexer, "SAVE")) { @@ -294,8 +374,10 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) workspace.extras = 0; workspace.res_idx = -1; workspace.pred_idx = -1; - workspace.writer = NULL; + workspace.writer = NULL; workspace.reader = NULL; + workspace.residvars = NULL; + workspace.predvars = NULL; if (save) { int i; @@ -303,8 +385,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) if (regression.resid) { - workspace.extras ++; - workspace.res_idx = 0; + workspace.res_idx = workspace.extras ++; workspace.residvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.residvars)); for (i = 0; i < regression.n_dep_vars; ++i) @@ -316,8 +397,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) if (regression.pred) { - workspace.extras ++; - workspace.pred_idx = 1; + workspace.pred_idx = workspace.extras ++; workspace.predvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.predvars)); for (i = 0; i < regression.n_dep_vars; ++i) @@ -331,11 +411,15 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) msg (SW, _("REGRESSION with SAVE ignores TEMPORARY. " "Temporary transformations will be made permanent.")); + if (dict_get_filter (dict)) + msg (SW, _("REGRESSION with SAVE ignores FILTER. " + "All cases will be processed.")); + workspace.writer = autopaging_writer_create (proto); + caseproto_unref (proto); } - n_splits = 0; { struct casegrouper *grouper; struct casereader *group; @@ -346,9 +430,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) while (casegrouper_get_next_group (grouper, &group)) { - workspace.psw = xrealloc (workspace.psw, ++n_splits * sizeof (*workspace.psw)); - - run_regression (®ression, &workspace.psw[n_splits - 1], + run_regression (®ression, &workspace, group); @@ -357,19 +439,22 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) ok = proc_commit (ds) && ok; } + if (workspace.writer) { - if (workspace.writer) - { - struct thing *thing = xmalloc (sizeof *thing); - struct casereader *r = casewriter_make_reader (workspace.writer); - workspace.writer = NULL; - workspace.reader = r; - thing->ws = xmalloc (sizeof (workspace)); - memcpy (thing->ws, &workspace, sizeof (workspace)); - thing->n_dep_vars = regression.n_dep_vars; - - add_transformation (ds, transX, NULL, thing); - } + struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data); + struct casereader *r = casewriter_make_reader (workspace.writer); + workspace.writer = NULL; + workspace.reader = r; + save_trans_data->ws = xmalloc (sizeof (workspace)); + memcpy (save_trans_data->ws, &workspace, sizeof (workspace)); + save_trans_data->n_dep_vars = regression.n_dep_vars; + + static const struct trns_class trns_class = { + .name = "REGRESSION", + .execute = save_trans_func, + .destroy = save_trans_free, + }; + add_transformation (ds, &trns_class, save_trans_data); } @@ -384,7 +469,7 @@ error: return CMD_FAILURE; } - +/* Return the size of the union of dependent and independent variables */ static size_t get_n_all_vars (const struct regression *cmd) { @@ -406,20 +491,21 @@ get_n_all_vars (const struct regression *cmd) return result; } +/* Fill VARS with the union of dependent and independent variables */ static void fill_all_vars (const struct variable **vars, const struct regression *cmd) { + size_t x = 0; size_t i; - size_t j; - bool absent; - for (i = 0; i < cmd->n_vars; i++) { vars[i] = cmd->vars[i]; } + for (i = 0; i < cmd->n_dep_vars; i++) { - absent = true; + size_t j; + bool absent = true; for (j = 0; j < cmd->n_vars; j++) { if (cmd->dep_vars[i] == cmd->vars[j]) @@ -430,11 +516,29 @@ fill_all_vars (const struct variable **vars, const struct regression *cmd) } if (absent) { - vars[i + cmd->n_vars] = cmd->dep_vars[i]; + vars[cmd->n_vars + x++] = cmd->dep_vars[i]; } } } + +/* Fill the array VARS, with all the predictor variables from CMD, except + variable X */ +static void +fill_predictor_x (const struct variable **vars, const struct variable *x, const struct regression *cmd) +{ + size_t i; + size_t n = 0; + + for (i = 0; i < cmd->n_vars; i++) + { + if (cmd->vars[i] == x) + continue; + + vars[n++] = cmd->vars[i]; + } +} + /* Is variable k the dependent variable? */ @@ -466,8 +570,8 @@ identify_indep_vars (const struct regression *cmd, */ msg (SW, gettext - ("The dependent variable is equal to the independent variable." - "The least squares line is therefore Y=X." + ("The dependent variable is equal to the independent variable. " + "The least squares line is therefore Y=X. " "Standard errors and related statistics may be meaningless.")); n_indep_vars = 1; indep_vars[0] = cmd->vars[0]; @@ -475,7 +579,6 @@ identify_indep_vars (const struct regression *cmd, return n_indep_vars; } - static double fill_covariance (gsl_matrix * cov, struct covariance *all_cov, const struct variable **vars, @@ -485,14 +588,14 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, { size_t i; size_t j; - size_t dep_subscript; + size_t dep_subscript = SIZE_MAX; size_t *rows; const gsl_matrix *ssizes; const gsl_matrix *mean_matrix; const gsl_matrix *ssize_matrix; double result = 0.0; - gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); + const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); if (cm == NULL) return 0; @@ -513,6 +616,8 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, dep_subscript = i; } } + assert (dep_subscript != SIZE_MAX); + mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); for (i = 0; i < cov->size1 - 1; i++) @@ -543,65 +648,83 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, gsl_matrix_get (cm, dep_subscript, dep_subscript)); free (rows); - gsl_matrix_free (cm); return result; } + + +struct model_container +{ + struct linreg **models; +}; /* STATISTICS subcommand output functions. */ -static void reg_stats_r (linreg *, void *, const struct variable *); -static void reg_stats_coeff (linreg *, void *, const struct variable *); -static void reg_stats_anova (linreg *, void *, const struct variable *); -static void reg_stats_bcov (linreg *, void *, const struct variable *); - -static void -statistics_keyword_output (void (*) - (linreg *, void *, const struct variable *), bool, - linreg *, void *, const struct variable *); - - - -static void -subcommand_statistics (const struct regression *cmd, linreg * c, void *aux, - const struct variable *var) -{ - statistics_keyword_output (reg_stats_r, cmd->r, c, aux, var); - statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux, var); - statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux, var); - statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux, var); -} - - -static void -run_regression (const struct regression *cmd, - struct per_split_ws *psw, - struct regression_workspace *ws, - struct casereader *input) +static void reg_stats_r (const struct linreg *, const struct variable *); +static void reg_stats_coeff (const struct regression *, const struct linreg *, + const struct model_container *, const gsl_matrix *, + const struct variable *); +static void reg_stats_anova (const struct linreg *, const struct variable *); +static void reg_stats_bcov (const struct linreg *, const struct variable *); + + +static struct linreg ** +run_regression_get_models (const struct regression *cmd, + struct casereader *input, + bool output) { size_t i; + struct model_container *model_container = XCALLOC (cmd->n_vars, struct model_container); - int k; struct ccase *c; struct covariance *cov; struct casereader *reader; + + if (cmd->stats & STATS_TOL) + { + for (i = 0; i < cmd->n_vars; i++) + { + struct regression subreg; + subreg.origin = cmd->origin; + subreg.ds = cmd->ds; + subreg.n_vars = cmd->n_vars - 1; + subreg.n_dep_vars = 1; + subreg.vars = xmalloc (sizeof (*subreg.vars) * cmd->n_vars - 1); + subreg.dep_vars = xmalloc (sizeof (*subreg.dep_vars)); + fill_predictor_x (subreg.vars, cmd->vars[i], cmd); + subreg.dep_vars[0] = cmd->vars[i]; + subreg.stats = STATS_R; + subreg.ci = 0; + subreg.resid = false; + subreg.pred = false; + + model_container[i].models = + run_regression_get_models (&subreg, input, false); + free (subreg.vars); + free (subreg.dep_vars); + } + } + size_t n_all_vars = get_n_all_vars (cmd); const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); - double *means = xnmalloc (n_all_vars, sizeof (*means)); - + /* In the (rather pointless) case where the dependent variable is + the independent variable, n_all_vars == 1. + However this would result in a buffer overflow so we must + over-allocate the space required in this malloc call. + See bug #58599 */ + double *means = xnmalloc (n_all_vars <= 1 ? 2 : n_all_vars, + sizeof (*means)); fill_all_vars (all_vars, cmd); cov = covariance_1pass_create (n_all_vars, all_vars, dict_get_weight (dataset_dict (cmd->ds)), - MV_ANY); + MV_ANY, cmd->origin == false); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, MV_ANY, NULL, NULL); - - - { +{ struct casereader *r = casereader_clone (reader); for (; (c = casereader_read (r)) != NULL; case_unref (c)) @@ -611,66 +734,94 @@ run_regression (const struct regression *cmd, casereader_destroy (r); } - psw->models = xcalloc (cmd->n_dep_vars, sizeof (*psw->models)); - for (k = 0; k < cmd->n_dep_vars; k++) - { + struct linreg **models = XCALLOC (cmd->n_dep_vars, struct linreg*); + for (int k = 0; k < cmd->n_dep_vars; k++) + { const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); const struct variable *dep_var = cmd->dep_vars[k]; int n_indep = identify_indep_vars (cmd, vars, dep_var); - gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); - double n_data = fill_covariance (this_cm, cov, vars, n_indep, + gsl_matrix *cov_matrix = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + double n_data = fill_covariance (cov_matrix, cov, vars, n_indep, dep_var, all_vars, n_all_vars, means); - psw->models[k] = linreg_alloc (dep_var, vars, n_data, n_indep); - psw->models[k]->depvar = dep_var; + models[k] = linreg_alloc (dep_var, vars, n_data, n_indep, cmd->origin); for (i = 0; i < n_indep; i++) { - linreg_set_indep_variable_mean (psw->models[k], i, means[i]); + linreg_set_indep_variable_mean (models[k], i, means[i]); } - linreg_set_depvar_mean (psw->models[k], means[i]); - /* - For large data sets, use QR decomposition. - */ - if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - psw->models[k]->method = LINREG_QR; - } - + linreg_set_depvar_mean (models[k], means[i]); if (n_data > 0) { - /* - Find the least-squares estimates and other statistics. - */ - linreg_fit (this_cm, psw->models[k]); + linreg_fit (cov_matrix, models[k]); - if (!taint_has_tainted_successor (casereader_get_taint (input))) + if (output && !taint_has_tainted_successor (casereader_get_taint (input))) { - subcommand_statistics (cmd, psw->models[k], this_cm, dep_var); - } + /* + Find the least-squares estimates and other statistics. + */ + if (cmd->stats & STATS_R) + reg_stats_r (models[k], dep_var); + + if (cmd->stats & STATS_ANOVA) + reg_stats_anova (models[k], dep_var); + + if (cmd->stats & STATS_COEFF) + reg_stats_coeff (cmd, models[k], + model_container, + cov_matrix, dep_var); + + if (cmd->stats & STATS_BCOV) + reg_stats_bcov (models[k], dep_var); + } } else { msg (SE, _("No valid data found. This command was skipped.")); } - gsl_matrix_free (this_cm); free (vars); + gsl_matrix_free (cov_matrix); } + casereader_destroy (reader); + + for (int i = 0; i < cmd->n_vars; i++) + { + if (model_container[i].models) + { + linreg_unref (model_container[i].models[0]); + } + free (model_container[i].models); + } + free (model_container); + + free (all_vars); + free (means); + covariance_destroy (cov); + return models; +} + +static void +run_regression (const struct regression *cmd, + struct regression_workspace *ws, + struct casereader *input) +{ + struct linreg **models = run_regression_get_models (cmd, input, true); if (ws->extras > 0) { - struct casereader *r = casereader_clone (reader); - + struct ccase *c; + struct casereader *r = casereader_clone (input); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - struct ccase *outc = case_clone (c); - for (k = 0; k < cmd->n_dep_vars; k++) + struct ccase *outc = case_create (casewriter_get_proto (ws->writer)); + for (int k = 0; k < cmd->n_dep_vars; k++) { const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); const struct variable *dep_var = cmd->dep_vars[k]; int n_indep = identify_indep_vars (cmd, vars, dep_var); double *vals = xnmalloc (n_indep, sizeof (*vals)); - for (i = 0; i < n_indep; i++) + for (int i = 0; i < n_indep; i++) { const union value *tmp = case_data (c, vars[i]); vals[i] = tmp->f; @@ -678,271 +829,275 @@ run_regression (const struct regression *cmd, if (cmd->pred) { - double pred = linreg_predict (psw->models[k], vals, n_indep); - case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred; + double pred = linreg_predict (models[k], vals, n_indep); + *case_num_rw_idx (outc, k * ws->extras + ws->pred_idx) = pred; } if (cmd->resid) { - double obs = case_data (c, psw->models[k]->depvar)->f; - double res = linreg_residual (psw->models[k], obs, vals, n_indep); - case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; + double obs = case_num (c, linreg_dep_var (models[k])); + double res = linreg_residual (models[k], obs, vals, n_indep); + *case_num_rw_idx (outc, k * ws->extras + ws->res_idx) = res; } - } + free (vals); + free (vars); + } casewriter_write (ws->writer, outc); } casereader_destroy (r); } - casereader_destroy (reader); - + for (int k = 0; k < cmd->n_dep_vars; k++) + { + linreg_unref (models[k]); + } - free (all_vars); - free (means); + free (models); casereader_destroy (input); - covariance_destroy (cov); } - - + static void -reg_stats_r (linreg * c, void *aux UNUSED, const struct variable *var) +reg_stats_r (const struct linreg * c, const struct variable *var) { - struct tab_table *t; - int n_rows = 2; - int n_cols = 5; - double rsq; - double adjrsq; - double std_error; - - assert (c != NULL); - rsq = linreg_ssreg (c) / linreg_sst (c); - adjrsq = rsq - - (1.0 - rsq) * linreg_n_coeffs (c) / (linreg_n_obs (c) - - linreg_n_coeffs (c) - 1); - std_error = sqrt (linreg_mse (c)); - t = tab_create (n_cols, n_rows); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); - tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); - tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); - tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); - tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); - tab_title (t, _("Model Summary (%s)"), var_to_string (var)); - tab_submit (t); + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_text_format (N_("Model Summary (%s)"), + var_to_string (var)), + "Model Summary"); + + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("R"), N_("R Square"), N_("Adjusted R Square"), + N_("Std. Error of the Estimate")); + + double rsq = linreg_ssreg (c) / linreg_sst (c); + double adjrsq = (rsq - + (1.0 - rsq) * linreg_n_coeffs (c) + / (linreg_n_obs (c) - linreg_n_coeffs (c) - 1)); + double std_error = sqrt (linreg_mse (c)); + + double entries[] = { + sqrt (rsq), rsq, adjrsq, std_error + }; + for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) + pivot_table_put1 (table, i, pivot_value_new_number (entries[i])); + + pivot_table_submit (table); } /* Table showing estimated regression coefficients. */ static void -reg_stats_coeff (linreg * c, void *aux_, const struct variable *var) +reg_stats_coeff (const struct regression *cmd, const struct linreg *c, + const struct model_container *mc, const gsl_matrix *cov, + const struct variable *var) { - size_t j; - int n_cols = 7; - int n_rows; - int this_row; - double t_stat; - double pval; - double std_err; - double beta; - const char *label; - - const struct variable *v; - struct tab_table *t; - gsl_matrix *cov = aux_; - - assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; - - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); - std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); - t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); - for (j = 0; j < linreg_n_coeffs (c); j++) + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_text_format (N_("Coefficients (%s)"), var_to_string (var)), + "Coefficients"); + + struct pivot_dimension *statistics = pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Statistics")); + pivot_category_create_group (statistics->root, + N_("Unstandardized Coefficients"), + N_("B"), N_("Std. Error")); + pivot_category_create_group (statistics->root, + N_("Standardized Coefficients"), N_("Beta")); + pivot_category_create_leaves (statistics->root, N_("t"), + N_("Sig."), PIVOT_RC_SIGNIFICANCE); + if (cmd->stats & STATS_CI) { - struct string tstr; - ds_init_empty (&tstr); - this_row = j + 2; - - v = linreg_indep_var (c, j); - label = var_to_string (v); - /* Do not overwrite the variable's name. */ - ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); - /* - Regression coefficients. - */ - tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); - /* - Standard error of the coefficients. - */ - std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); - tab_double (t, 3, this_row, 0, std_err, NULL); - /* - Standardized coefficient, i.e., regression coefficient - if all variables had unit variance. - */ - beta = sqrt (gsl_matrix_get (cov, j, j)); - beta *= linreg_coeff (c, j) / - sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); - tab_double (t, 4, this_row, 0, beta, NULL); - - /* - Test statistic for H0: coefficient is 0. - */ - t_stat = linreg_coeff (c, j) / std_err; - tab_double (t, 5, this_row, 0, t_stat, NULL); - /* - P values for the test statistic above. - */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - - linreg_n_coeffs (c) - 1)); - tab_double (t, 6, this_row, 0, pval, NULL); - ds_destroy (&tstr); + struct pivot_category *interval = pivot_category_create_group__ ( + statistics->root, pivot_value_new_text_format ( + N_("%g%% Confidence Interval for B"), + cmd->ci * 100.0)); + pivot_category_create_leaves (interval, N_("Lower Bound"), + N_("Upper Bound")); } - tab_title (t, _("Coefficients (%s)"), var_to_string (var)); - tab_submit (t); -} -/* - Display the ANOVA table. -*/ -static void -reg_stats_anova (linreg * c, void *aux UNUSED, const struct variable *var) -{ - int n_cols = 7; - int n_rows = 4; - const double msm = linreg_ssreg (c) / linreg_dfmodel (c); - const double mse = linreg_mse (c); - const double F = msm / mse; - const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe); - - struct tab_table *t; + if (cmd->stats & STATS_TOL) + pivot_category_create_group (statistics->root, + N_("Collinearity Statistics"), + N_("Tolerance"), N_("VIF")); - assert (c != NULL); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); + struct pivot_dimension *variables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variables")); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); + double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1; + double q = (1 - cmd->ci) / 2.0; /* 2-tailed test */ + double tval = gsl_cdf_tdist_Qinv (q, df); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + if (!cmd->origin) + { + int var_idx = pivot_category_create_leaf ( + variables->root, pivot_value_new_text (N_("(Constant)"))); + + double std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); + double t_stat = linreg_intercept (c) / std_err; + double base_entries[] = { + linreg_intercept (c), + std_err, + 0.0, + t_stat, + 2.0 * gsl_cdf_tdist_Q (fabs (t_stat), + linreg_n_obs (c) - linreg_n_coeffs (c)), + }; + + size_t col = 0; + for (size_t i = 0; i < sizeof base_entries / sizeof *base_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (base_entries[i])); + + if (cmd->stats & STATS_CI) + { + double interval_entries[] = { + linreg_intercept (c) - tval * std_err, + linreg_intercept (c) + tval * std_err, + }; + + for (size_t i = 0; i < sizeof interval_entries / sizeof *interval_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (interval_entries[i])); + } + } - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); - tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); - tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); + for (size_t j = 0; j < linreg_n_coeffs (c); j++) + { + const struct variable *v = linreg_indep_var (c, j); + int var_idx = pivot_category_create_leaf ( + variables->root, pivot_value_new_variable (v)); + + double std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); + double t_stat = linreg_coeff (c, j) / std_err; + double base_entries[] = { + linreg_coeff (c, j), + sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)), + (sqrt (gsl_matrix_get (cov, j, j)) * linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1))), + t_stat, + 2 * gsl_cdf_tdist_Q (fabs (t_stat), df) + }; + + size_t col = 0; + for (size_t i = 0; i < sizeof base_entries / sizeof *base_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (base_entries[i])); + + if (cmd->stats & STATS_CI) + { + double interval_entries[] = { + linreg_coeff (c, j) - tval * std_err, + linreg_coeff (c, j) + tval * std_err, + }; + + + for (size_t i = 0; i < sizeof interval_entries / sizeof *interval_entries; i++) + pivot_table_put2 (table, col++, var_idx, + pivot_value_new_number (interval_entries[i])); + } + + if (cmd->stats & STATS_TOL) + { + { + struct linreg *m = mc[j].models[0]; + double rsq = linreg_ssreg (m) / linreg_sst (m); + pivot_table_put2 (table, col++, var_idx, pivot_value_new_number (1.0 - rsq)); + pivot_table_put2 (table, col++, var_idx, pivot_value_new_number (1.0 / (1.0 - rsq))); + } + } + } - /* Sums of Squares */ - tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); - tab_double (t, 2, 3, 0, linreg_sst (c), NULL); - tab_double (t, 2, 2, 0, linreg_sse (c), NULL); + pivot_table_submit (table); +} +/* + Display the ANOVA table. +*/ +static void +reg_stats_anova (const struct linreg * c, const struct variable *var) +{ + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_text_format (N_("ANOVA (%s)"), var_to_string (var)), + "ANOVA"); - /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("Sum of Squares"), PIVOT_RC_OTHER, + N_("df"), PIVOT_RC_INTEGER, + N_("Mean Square"), PIVOT_RC_OTHER, + N_("F"), PIVOT_RC_OTHER, + N_("Sig."), PIVOT_RC_SIGNIFICANCE); - /* Mean Squares */ - tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); - tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); + pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Source"), + N_("Regression"), N_("Residual"), N_("Total")); - tab_double (t, 5, 1, 0, F, NULL); + double msm = linreg_ssreg (c) / linreg_dfmodel (c); + double mse = linreg_mse (c); + double F = msm / mse; - tab_double (t, 6, 1, 0, pval, NULL); + struct entry + { + int stat_idx; + int source_idx; + double x; + } + entries[] = { + /* Sums of Squares. */ + { 0, 0, linreg_ssreg (c) }, + { 0, 1, linreg_sse (c) }, + { 0, 2, linreg_sst (c) }, + /* Degrees of freedom. */ + { 1, 0, linreg_dfmodel (c) }, + { 1, 1, linreg_dferror (c) }, + { 1, 2, linreg_dftotal (c) }, + /* Mean Squares. */ + { 2, 0, msm }, + { 2, 1, mse }, + /* F */ + { 3, 0, F }, + /* Significance. */ + { 4, 0, gsl_cdf_fdist_Q (F, linreg_dfmodel (c), linreg_dferror (c)) }, + }; + for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) + { + const struct entry *e = &entries[i]; + pivot_table_put2 (table, e->stat_idx, e->source_idx, + pivot_value_new_number (e->x)); + } - tab_title (t, _("ANOVA (%s)"), var_to_string (var)); - tab_submit (t); + pivot_table_submit (table); } static void -reg_stats_bcov (linreg * c, void *aux UNUSED, const struct variable *var) +reg_stats_bcov (const struct linreg * c, const struct variable *var) { - int n_cols; - int n_rows; - int i; - int k; - int row; - int col; - const char *label; - struct tab_table *t; - - assert (c != NULL); - n_cols = c->n_indeps + 1 + 2; - n_rows = 2 * (c->n_indeps + 1); - t = tab_create (n_cols, n_rows); - tab_headers (t, 2, 0, 1, 0); - tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); - tab_vline (t, TAL_2, 2, 0, n_rows - 1); - tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model")); - tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances")); - for (i = 0; i < linreg_n_coeffs (c); i++) - { - const struct variable *v = linreg_indep_var (c, i); - label = var_to_string (v); - tab_text (t, 2, i, TAB_CENTER, label); - tab_text (t, i + 2, 0, TAB_CENTER, label); - for (k = 1; k < linreg_n_coeffs (c); k++) - { - col = (i <= k) ? k : i; - row = (i <= k) ? i : k; - tab_double (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), NULL); - } - } - tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var)); - tab_submit (t); -} + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_text_format (N_("Coefficient Correlations (%s)"), + var_to_string (var)), + "Coefficient Correlations"); -static void -statistics_keyword_output (void (*function) - (linreg *, void *, const struct variable * var), - bool keyword, linreg * c, void *aux, - const struct variable *var) -{ - if (keyword) + for (size_t i = 0; i < 2; i++) { - (*function) (c, aux, var); + struct pivot_dimension *models = pivot_dimension_create ( + table, i ? PIVOT_AXIS_ROW : PIVOT_AXIS_COLUMN, N_("Models")); + for (size_t j = 0; j < linreg_n_coeffs (c); j++) + pivot_category_create_leaf ( + models->root, pivot_value_new_variable ( + linreg_indep_var (c, j))); } + + pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"), + N_("Covariances")); + + for (size_t i = 0; i < linreg_n_coeffs (c); i++) + for (size_t k = 0; k < linreg_n_coeffs (c); k++) + { + double cov = gsl_matrix_get (linreg_cov (c), MIN (i, k), MAX (i, k)); + pivot_table_put3 (table, k, i, 0, pivot_value_new_number (cov)); + } + + pivot_table_submit (table); }