X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fregression.c;h=66a5abaf7c8aa7011d071bc5a9837c06904451e5;hb=912833f9331784d692ade66c76bffb5497b890e0;hp=9e1aba8815e22f173fab6042db2acf5b293e1cef;hpb=f481fd69631024bcdc7dc2369bbc1592d7a43ac7;p=pspp diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index 9e1aba8815..66a5abaf7c 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2005, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,12 +16,14 @@ #include +#include #include #include #include #include +#include #include "language/command.h" #include "language/lexer/lexer.h" @@ -51,6 +53,17 @@ #define REG_LARGE_DATA 1000 +#define STATS_R 1 +#define STATS_COEFF 2 +#define STATS_ANOVA 4 +#define STATS_OUTS 8 +#define STATS_CI 16 +#define STATS_BCOV 32 + +#define STATS_DEFAULT (STATS_R | STATS_COEFF | STATS_ANOVA | STATS_OUTS) + + + struct regression { struct dataset *ds; @@ -61,118 +74,39 @@ struct regression const struct variable **dep_vars; size_t n_dep_vars; - bool r; - bool coeff; - bool anova; - bool bcov; - + unsigned int stats; + double ci; bool resid; bool pred; - - linreg **models; }; +struct regression_workspace +{ + /* The new variables which will be introduced by /SAVE */ + const struct variable **predvars; + const struct variable **residvars; -static void run_regression (const struct regression *cmd, struct casereader *input); - + /* A reader/writer pair to temporarily hold the + values of the new variables */ + struct casewriter *writer; + struct casereader *reader; + /* Indeces of the new values in the reader/writer (-1 if not applicable) 
*/ + int res_idx; + int pred_idx; -/* - Transformations for saving predicted values - and residuals, etc. -*/ -struct reg_trns -{ - int n_trns; /* Number of transformations. */ - int trns_id; /* Which trns is this one? */ - linreg *c; /* Linear model for this trns. */ + /* 0, 1 or 2 depending on what new variables are to be created */ + int extras; }; -/* - Gets the predicted values. -*/ -static int -regression_trns_pred_proc (void *t_, struct ccase **c, - casenumber case_idx UNUSED) -{ - size_t i; - size_t n_vals; - struct reg_trns *trns = t_; - linreg *model; - union value *output = NULL; - const union value *tmp; - double *vals; - const struct variable **vars = NULL; - - assert (trns != NULL); - model = trns->c; - assert (model != NULL); - assert (model->depvar != NULL); - assert (model->pred != NULL); - - vars = linreg_get_vars (model); - n_vals = linreg_n_coeffs (model); - vals = xnmalloc (n_vals, sizeof (*vals)); - *c = case_unshare (*c); - - output = case_data_rw (*c, model->pred); - - for (i = 0; i < n_vals; i++) - { - tmp = case_data (*c, vars[i]); - vals[i] = tmp->f; - } - output->f = linreg_predict (model, vals, n_vals); - free (vals); - return TRNS_CONTINUE; -} - -/* - Gets the residuals. 
-*/ -static int -regression_trns_resid_proc (void *t_, struct ccase **c, - casenumber case_idx UNUSED) -{ - size_t i; - size_t n_vals; - struct reg_trns *trns = t_; - linreg *model; - union value *output = NULL; - const union value *tmp; - double *vals = NULL; - double obs; - const struct variable **vars = NULL; - - assert (trns != NULL); - model = trns->c; - assert (model != NULL); - assert (model->depvar != NULL); - assert (model->resid != NULL); - - vars = linreg_get_vars (model); - n_vals = linreg_n_coeffs (model); - - vals = xnmalloc (n_vals, sizeof (*vals)); - *c = case_unshare (*c); - output = case_data_rw (*c, model->resid); - assert (output != NULL); - - for (i = 0; i < n_vals; i++) - { - tmp = case_data (*c, vars[i]); - vals[i] = tmp->f; - } - tmp = case_data (*c, model->depvar); - obs = tmp->f; - output->f = linreg_residual (model, obs, vals, n_vals); - free (vals); - - return TRNS_CONTINUE; -} +static void run_regression (const struct regression *cmd, + struct regression_workspace *ws, + struct casereader *input); +/* Return a string based on PREFIX which may be used as the name + of a new variable in DICT */ static char * reg_get_name (const struct dictionary *dict, const char *prefix) { @@ -181,7 +115,7 @@ reg_get_name (const struct dictionary *dict, const char *prefix) /* XXX handle too-long prefixes */ name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1); - for (i = 1; ; i++) + for (i = 1;; i++) { sprintf (name, "%s%d", prefix, i); if (dict_lookup_var (dict, name) == NULL) @@ -189,94 +123,83 @@ reg_get_name (const struct dictionary *dict, const char *prefix) } } -/* - Free the transformation. Free its linear model if this - transformation is the last one. 
-*/ -static bool -regression_trns_free (void *t_) -{ - bool result = true; - struct reg_trns *t = t_; - - if (t->trns_id == t->n_trns) - { - result = linreg_free (t->c); - } - free (t); - - return result; -} -static void -reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f, - linreg * c, struct variable **v, int n_trns) +static const struct variable * +create_aux_var (struct dataset *ds, const char *prefix) { + struct variable *var; struct dictionary *dict = dataset_dict (ds); - static int trns_index = 1; - char *name; - struct variable *new_var; - struct reg_trns *t = NULL; - - t = xmalloc (sizeof (*t)); - t->trns_id = trns_index; - t->n_trns = n_trns; - t->c = c; - - name = reg_get_name (dict, prefix); - new_var = dict_create_var_assert (dict, name, 0); + char *name = reg_get_name (dict, prefix); + var = dict_create_var_assert (dict, name, 0); free (name); - - *v = new_var; - add_transformation (ds, f, regression_trns_free, t); - trns_index++; + return var; } -static void -subcommand_save (const struct regression *cmd) +/* Auxilliary data for transformation when /SAVE is entered */ +struct save_trans_data { - linreg **lc; - int n_trns = 0; + int n_dep_vars; + struct regression_workspace *ws; +}; - if ( cmd->resid ) n_trns++; - if ( cmd->pred ) n_trns++; +static bool +save_trans_free (void *aux) +{ + struct save_trans_data *save_trans_data = aux; + free (save_trans_data->ws->predvars); + free (save_trans_data->ws->residvars); + + casereader_destroy (save_trans_data->ws->reader); + free (save_trans_data->ws); + free (save_trans_data); + return true; +} - n_trns *= cmd->n_dep_vars; +static int +save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) +{ + struct save_trans_data *save_trans_data = aux; + struct regression_workspace *ws = save_trans_data->ws; + struct ccase *in = casereader_read (ws->reader); - for (lc = cmd->models; lc < cmd->models + cmd->n_dep_vars; lc++) + if (in) { - if (*lc != NULL) + int k; + *c = case_unshare 
(*c); + + for (k = 0; k < save_trans_data->n_dep_vars; ++k) { - if ((*lc)->depvar != NULL) + if (ws->pred_idx != -1) { - if (cmd->resid) - { - reg_save_var (cmd->ds, "RES", regression_trns_resid_proc, *lc, - &(*lc)->resid, n_trns); - } - if (cmd->pred) - { - reg_save_var (cmd->ds, "PRED", regression_trns_pred_proc, *lc, - &(*lc)->pred, n_trns); - } + double pred = case_data_idx (in, ws->extras * k + ws->pred_idx)->f; + case_data_rw (*c, ws->predvars[k])->f = pred; + } + + if (ws->res_idx != -1) + { + double resid = case_data_idx (in, ws->extras * k + ws->res_idx)->f; + case_data_rw (*c, ws->residvars[k])->f = resid; } } + case_unref (in); } + + return TRNS_CONTINUE; } + int cmd_regression (struct lexer *lexer, struct dataset *ds) { - int k; + struct regression_workspace workspace; struct regression regression; const struct dictionary *dict = dataset_dict (ds); + bool save; memset (&regression, 0, sizeof (struct regression)); - regression.anova = true; - regression.coeff = true; - regression.r = true; - + regression.ci = 0.95; + regression.stats = STATS_DEFAULT; regression.pred = false; regression.resid = false; @@ -284,15 +207,15 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) /* Accept an optional, completely pointless "/VARIABLES=" */ lex_match (lexer, T_SLASH); - if (lex_match_id (lexer, "VARIABLES")) + if (lex_match_id (lexer, "VARIABLES")) { - if (! lex_force_match (lexer, T_EQUALS) ) + if (!lex_force_match (lexer, T_EQUALS)) goto error; } if (!parse_variables_const (lexer, dict, - &regression.vars, &regression.n_vars, - PV_NO_DUPLICATE | PV_NUMERIC)) + &regression.vars, &regression.n_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) goto error; @@ -300,19 +223,22 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) { lex_match (lexer, T_SLASH); - if (lex_match_id (lexer, "DEPENDENT")) + if (lex_match_id (lexer, "DEPENDENT")) { - if (! 
lex_force_match (lexer, T_EQUALS) ) - goto error; + lex_match (lexer, T_EQUALS); + free (regression.dep_vars); + regression.n_dep_vars = 0; + if (!parse_variables_const (lexer, dict, - &regression.dep_vars, &regression.n_dep_vars, + &regression.dep_vars, + &regression.n_dep_vars, PV_NO_DUPLICATE | PV_NUMERIC)) goto error; } else if (lex_match_id (lexer, "METHOD")) - { - lex_match (lexer, T_EQUALS); + { + lex_match (lexer, T_EQUALS); if (!lex_force_match_id (lexer, "ENTER")) { @@ -320,29 +246,48 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) } } else if (lex_match_id (lexer, "STATISTICS")) - { - lex_match (lexer, T_EQUALS); + { + unsigned long statistics = 0; + lex_match (lexer, T_EQUALS); - while (lex_token (lexer) != T_ENDCMD - && lex_token (lexer) != T_SLASH) - { + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { if (lex_match (lexer, T_ALL)) { + statistics = ~0; } else if (lex_match_id (lexer, "DEFAULTS")) { + statistics |= STATS_DEFAULT; } else if (lex_match_id (lexer, "R")) { + statistics |= STATS_R; } else if (lex_match_id (lexer, "COEFF")) { + statistics |= STATS_COEFF; } else if (lex_match_id (lexer, "ANOVA")) { + statistics |= STATS_ANOVA; } else if (lex_match_id (lexer, "BCOV")) { + statistics |= STATS_BCOV; + } + else if (lex_match_id (lexer, "CI")) + { + statistics |= STATS_CI; + + if (lex_match (lexer, T_LPAREN) && + lex_force_num (lexer)) + { + regression.ci = lex_number (lexer) / 100.0; + lex_get (lexer); + lex_force_match (lexer, T_RPAREN); + } } else { @@ -350,14 +295,18 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) goto error; } } + + if (statistics) + regression.stats = statistics; + } else if (lex_match_id (lexer, "SAVE")) - { - lex_match (lexer, T_EQUALS); + { + lex_match (lexer, T_EQUALS); - while (lex_token (lexer) != T_ENDCMD - && lex_token (lexer) != T_SLASH) - { + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { if (lex_match_id (lexer, "PRED")) { regression.pred = true; 
@@ -385,42 +334,101 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) dict_get_vars (dict, &regression.vars, &regression.n_vars, 0); } + save = regression.pred || regression.resid; + workspace.extras = 0; + workspace.res_idx = -1; + workspace.pred_idx = -1; + workspace.writer = NULL; + workspace.reader = NULL; + workspace.residvars = NULL; + workspace.predvars = NULL; + if (save) + { + int i; + struct caseproto *proto = caseproto_create (); + + if (regression.resid) + { + workspace.res_idx = workspace.extras ++; + workspace.residvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.residvars)); + + for (i = 0; i < regression.n_dep_vars; ++i) + { + workspace.residvars[i] = create_aux_var (ds, "RES"); + proto = caseproto_add_width (proto, 0); + } + } + + if (regression.pred) + { + workspace.pred_idx = workspace.extras ++; + workspace.predvars = xcalloc (regression.n_dep_vars, sizeof (*workspace.predvars)); + + for (i = 0; i < regression.n_dep_vars; ++i) + { + workspace.predvars[i] = create_aux_var (ds, "PRED"); + proto = caseproto_add_width (proto, 0); + } + } + + if (proc_make_temporary_transformations_permanent (ds)) + msg (SW, _("REGRESSION with SAVE ignores TEMPORARY. " + "Temporary transformations will be made permanent.")); + + if (dict_get_filter (dict)) + msg (SW, _("REGRESSION with SAVE ignores FILTER. 
" + "All cases will be processed.")); + + workspace.writer = autopaging_writer_create (proto); + caseproto_unref (proto); + } - regression.models = xcalloc (regression.n_dep_vars, sizeof *regression.models); { struct casegrouper *grouper; struct casereader *group; bool ok; - - grouper = casegrouper_create_splits (proc_open (ds), dict); + + grouper = casegrouper_create_splits (proc_open_filtering (ds, !save), dict); + + while (casegrouper_get_next_group (grouper, &group)) - run_regression (&regression, group); + { + run_regression (&regression, + &workspace, + group); + + } ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; } - if (regression.pred || regression.resid ) - subcommand_save (&regression); - + if (workspace.writer) + { + struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data); + struct casereader *r = casewriter_make_reader (workspace.writer); + workspace.writer = NULL; + workspace.reader = r; + save_trans_data->ws = xmalloc (sizeof (workspace)); + memcpy (save_trans_data->ws, &workspace, sizeof (workspace)); + save_trans_data->n_dep_vars = regression.n_dep_vars; + + add_transformation (ds, save_trans_func, save_trans_free, save_trans_data); + } + - for (k = 0; k < regression.n_dep_vars; k++) - linreg_free (regression.models[k]); - free (regression.models); free (regression.vars); free (regression.dep_vars); return CMD_SUCCESS; - - error: - for (k = 0; k < regression.n_dep_vars; k++) - linreg_free (regression.models[k]); - free (regression.models); + +error: + free (regression.vars); free (regression.dep_vars); return CMD_FAILURE; } - +/* Return the size of the union of dependent and independent variables */ static size_t get_n_all_vars (const struct regression *cmd) { @@ -432,42 +440,43 @@ get_n_all_vars (const struct regression *cmd) for (i = 0; i < cmd->n_dep_vars; i++) { for (j = 0; j < cmd->n_vars; j++) - { - if (cmd->vars[j] == cmd->dep_vars[i]) - { - result--; - } - } + { + if (cmd->vars[j] == cmd->dep_vars[i]) + { + 
result--; + } + } } return result; } +/* Fill VARS with the union of dependent and independent variables */ static void fill_all_vars (const struct variable **vars, const struct regression *cmd) { + size_t x = 0; size_t i; - size_t j; - bool absent; - for (i = 0; i < cmd->n_vars; i++) { vars[i] = cmd->vars[i]; } + for (i = 0; i < cmd->n_dep_vars; i++) { - absent = true; + size_t j; + bool absent = true; for (j = 0; j < cmd->n_vars; j++) - { - if (cmd->dep_vars[i] == cmd->vars[j]) - { - absent = false; - break; - } - } + { + if (cmd->dep_vars[i] == cmd->vars[j]) + { + absent = false; + break; + } + } if (absent) - { - vars[i + cmd->n_vars] = cmd->dep_vars[i]; - } + { + vars[cmd->n_vars + x++] = cmd->dep_vars[i]; + } } } @@ -484,9 +493,9 @@ is_depvar (const struct regression *cmd, size_t k, const struct variable *v) /* Identify the explanatory variables in v_variables. Returns the number of independent variables. */ static int -identify_indep_vars (const struct regression *cmd, +identify_indep_vars (const struct regression *cmd, const struct variable **indep_vars, - const struct variable *depvar) + const struct variable *depvar) { int n_indep_vars = 0; int i; @@ -497,13 +506,14 @@ identify_indep_vars (const struct regression *cmd, if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar)) { /* - There is only one independent variable, and it is the same - as the dependent variable. Print a warning and continue. - */ - msg (SE, - gettext ("The dependent variable is equal to the independent variable." - "The least squares line is therefore Y=X." - "Standard errors and related statistics may be meaningless.")); + There is only one independent variable, and it is the same + as the dependent variable. Print a warning and continue. + */ + msg (SW, + gettext + ("The dependent variable is equal to the independent variable. " + "The least squares line is therefore Y=X. 
" + "Standard errors and related statistics may be meaningless.")); n_indep_vars = 1; indep_vars[0] = cmd->vars[0]; } @@ -512,11 +522,11 @@ identify_indep_vars (const struct regression *cmd, static double -fill_covariance (gsl_matrix *cov, struct covariance *all_cov, - const struct variable **vars, - size_t n_vars, const struct variable *dep_var, - const struct variable **all_vars, size_t n_all_vars, - double *means) +fill_covariance (gsl_matrix * cov, struct covariance *all_cov, + const struct variable **vars, + size_t n_vars, const struct variable *dep_var, + const struct variable **all_vars, size_t n_all_vars, + double *means) { size_t i; size_t j; @@ -526,39 +536,39 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, const gsl_matrix *mean_matrix; const gsl_matrix *ssize_matrix; double result = 0.0; - - gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); - if ( cm == NULL) + const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); + + if (cm == NULL) return 0; rows = xnmalloc (cov->size1 - 1, sizeof (*rows)); - + for (i = 0; i < n_all_vars; i++) { for (j = 0; j < n_vars; j++) - { - if (vars[j] == all_vars[i]) - { - rows[j] = i; - } - } + { + if (vars[j] == all_vars[i]) + { + rows[j] = i; + } + } if (all_vars[i] == dep_var) - { - dep_subscript = i; - } + { + dep_subscript = i; + } } mean_matrix = covariance_moments (all_cov, MOMENT_MEAN); ssize_matrix = covariance_moments (all_cov, MOMENT_NONE); for (i = 0; i < cov->size1 - 1; i++) { means[i] = gsl_matrix_get (mean_matrix, rows[i], 0) - / gsl_matrix_get (ssize_matrix, rows[i], 0); + / gsl_matrix_get (ssize_matrix, rows[i], 0); for (j = 0; j < cov->size2 - 1; j++) - { - gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); - gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); - } + { + gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j])); + gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i])); + } } means[cov->size1 - 1] = 
gsl_matrix_get (mean_matrix, dep_subscript, 0) / gsl_matrix_get (ssize_matrix, dep_subscript, 0); @@ -566,144 +576,190 @@ fill_covariance (gsl_matrix *cov, struct covariance *all_cov, result = gsl_matrix_get (ssizes, dep_subscript, rows[0]); for (i = 0; i < cov->size1 - 1; i++) { - gsl_matrix_set (cov, i, cov->size1 - 1, - gsl_matrix_get (cm, rows[i], dep_subscript)); - gsl_matrix_set (cov, cov->size1 - 1, i, - gsl_matrix_get (cm, rows[i], dep_subscript)); + gsl_matrix_set (cov, i, cov->size1 - 1, + gsl_matrix_get (cm, rows[i], dep_subscript)); + gsl_matrix_set (cov, cov->size1 - 1, i, + gsl_matrix_get (cm, rows[i], dep_subscript)); if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript)) - { - result = gsl_matrix_get (ssizes, rows[i], dep_subscript); - } + { + result = gsl_matrix_get (ssizes, rows[i], dep_subscript); + } } - gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, - gsl_matrix_get (cm, dep_subscript, dep_subscript)); + gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, + gsl_matrix_get (cm, dep_subscript, dep_subscript)); free (rows); - gsl_matrix_free (cm); return result; } + /* STATISTICS subcommand output functions. 
*/ -static void reg_stats_r (linreg *, void *); -static void reg_stats_coeff (linreg *, void *); -static void reg_stats_anova (linreg *, void *); -static void reg_stats_bcov (linreg *, void *); - -static void statistics_keyword_output (void (*)(linreg *, void *), - bool, linreg *, void *); - +static void reg_stats_r (const linreg *, const struct variable *); +static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *, const struct regression *); +static void reg_stats_anova (const linreg *, const struct variable *); +static void reg_stats_bcov (const linreg *, const struct variable *); static void -subcommand_statistics (const struct regression *cmd , linreg * c, void *aux) +subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm, + const struct variable *var) { - statistics_keyword_output (reg_stats_r, cmd->r, c, aux); - statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux); - statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux); - statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux); + if (cmd->stats & STATS_R) + reg_stats_r (c, var); + + if (cmd->stats & STATS_ANOVA) + reg_stats_anova (c, var); + + if (cmd->stats & STATS_COEFF) + reg_stats_coeff (c, cm, var, cmd); + + if (cmd->stats & STATS_BCOV) + reg_stats_bcov (c, var); } static void -run_regression (const struct regression *cmd, struct casereader *input) +run_regression (const struct regression *cmd, + struct regression_workspace *ws, + struct casereader *input) { size_t i; - int n_indep = 0; + linreg **models; + int k; - double *means; struct ccase *c; struct covariance *cov; - const struct variable **vars; - const struct variable **all_vars; - const struct variable *dep_var; struct casereader *reader; - size_t n_all_vars; + size_t n_all_vars = get_n_all_vars (cmd); + const struct variable **all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); - linreg **models = cmd->models; + double *means = xnmalloc 
(n_all_vars, sizeof (*means)); - n_all_vars = get_n_all_vars (cmd); - all_vars = xnmalloc (n_all_vars, sizeof (*all_vars)); fill_all_vars (all_vars, cmd); - vars = xnmalloc (cmd->n_vars, sizeof (*vars)); - means = xnmalloc (n_all_vars, sizeof (*means)); cov = covariance_1pass_create (n_all_vars, all_vars, - dict_get_weight (dataset_dict (cmd->ds)), MV_ANY); + dict_get_weight (dataset_dict (cmd->ds)), + MV_ANY); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, all_vars, n_all_vars, - MV_ANY, NULL, NULL); + MV_ANY, NULL, NULL); - for (; (c = casereader_read (reader)) != NULL; case_unref (c)) - { - covariance_accumulate (cov, c); - } + { + struct casereader *r = casereader_clone (reader); + + for (; (c = casereader_read (r)) != NULL; case_unref (c)) + { + covariance_accumulate (cov, c); + } + casereader_destroy (r); + } + models = xcalloc (cmd->n_dep_vars, sizeof (*models)); for (k = 0; k < cmd->n_dep_vars; k++) { - double n_data; - - gsl_matrix *this_cm; - dep_var = cmd->dep_vars[k]; - n_indep = identify_indep_vars (cmd, vars, dep_var); - - this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); - n_data = fill_covariance (this_cm, cov, vars, n_indep, - dep_var, all_vars, n_all_vars, means); - models[k] = linreg_alloc (dep_var, (const struct variable **) vars, - n_data, n_indep); + const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); + const struct variable *dep_var = cmd->dep_vars[k]; + int n_indep = identify_indep_vars (cmd, vars, dep_var); + gsl_matrix *this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1); + double n_data = fill_covariance (this_cm, cov, vars, n_indep, + dep_var, all_vars, n_all_vars, means); + models[k] = linreg_alloc (dep_var, vars, n_data, n_indep); models[k]->depvar = dep_var; for (i = 0; i < n_indep; i++) - { - linreg_set_indep_variable_mean (models[k], i, means[i]); - } + { + linreg_set_indep_variable_mean (models[k], i, means[i]); + } linreg_set_depvar_mean (models[k], means[i]); /* - 
For large data sets, use QR decomposition. - */ + For large data sets, use QR decomposition. + */ if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA) - { - models[k]->method = LINREG_QR; - } + { + models[k]->method = LINREG_QR; + } if (n_data > 0) - { - /* - Find the least-squares estimates and other statistics. - */ - linreg_fit (this_cm, models[k]); - - if (!taint_has_tainted_successor (casereader_get_taint (input))) - { - subcommand_statistics (cmd, models[k], this_cm); - } - } + { + /* + Find the least-squares estimates and other statistics. + */ + linreg_fit (this_cm, models[k]); + + if (!taint_has_tainted_successor (casereader_get_taint (input))) + { + subcommand_statistics (cmd, models[k], this_cm, dep_var); + } + } else - { - msg (SE, - _("No valid data found. This command was skipped.")); - linreg_free (models[k]); - models[k] = NULL; - } + { + msg (SE, _("No valid data found. This command was skipped.")); + } gsl_matrix_free (this_cm); + free (vars); + } + + + if (ws->extras > 0) + { + struct casereader *r = casereader_clone (reader); + + for (; (c = casereader_read (r)) != NULL; case_unref (c)) + { + struct ccase *outc = case_create (casewriter_get_proto (ws->writer)); + for (k = 0; k < cmd->n_dep_vars; k++) + { + const struct variable **vars = xnmalloc (cmd->n_vars, sizeof (*vars)); + const struct variable *dep_var = cmd->dep_vars[k]; + int n_indep = identify_indep_vars (cmd, vars, dep_var); + double *vals = xnmalloc (n_indep, sizeof (*vals)); + for (i = 0; i < n_indep; i++) + { + const union value *tmp = case_data (c, vars[i]); + vals[i] = tmp->f; + } + + if (cmd->pred) + { + double pred = linreg_predict (models[k], vals, n_indep); + case_data_rw_idx (outc, k * ws->extras + ws->pred_idx)->f = pred; + } + + if (cmd->resid) + { + double obs = case_data (c, models[k]->depvar)->f; + double res = linreg_residual (models[k], obs, vals, n_indep); + case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; + } + free (vals); + free (vars); + } + 
casewriter_write (ws->writer, outc); + } + casereader_destroy (r); } - + casereader_destroy (reader); - free (vars); + + for (k = 0; k < cmd->n_dep_vars; k++) + { + linreg_unref (models[k]); + } + free (models); + free (all_vars); free (means); casereader_destroy (input); covariance_destroy (cov); } - static void -reg_stats_r (linreg *c, void *aux UNUSED) +reg_stats_r (const linreg * c, const struct variable *var) { struct tab_table *t; int n_rows = 2; @@ -714,7 +770,9 @@ reg_stats_r (linreg *c, void *aux UNUSED) assert (c != NULL); rsq = linreg_ssreg (c) / linreg_sst (c); - adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c)); + adjrsq = rsq - + (1.0 - rsq) * linreg_n_coeffs (c) / (linreg_n_obs (c) - + linreg_n_coeffs (c) - 1); std_error = sqrt (linreg_mse (c)); t = tab_create (n_cols, n_rows); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); @@ -726,11 +784,11 @@ reg_stats_r (linreg *c, void *aux UNUSED) tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square")); tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square")); tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate")); - tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL); - tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL); - tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL); - tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL); - tab_title (t, _("Model Summary")); + tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL, RC_OTHER); + tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL, RC_OTHER); + tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL, RC_OTHER); + tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL, RC_OTHER); + tab_title (t, _("Model Summary (%s)"), var_to_string (var)); tab_submit (t); } @@ -738,10 +796,11 @@ reg_stats_r (linreg *c, void *aux UNUSED) Table showing estimated regression coefficients. 
*/ static void -reg_stats_coeff (linreg * c, void *aux_) +reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var, const struct regression *cmd) { size_t j; int n_cols = 7; + const int heading_rows = 2; int n_rows; int this_row; double t_stat; @@ -752,76 +811,109 @@ reg_stats_coeff (linreg * c, void *aux_) const struct variable *v; struct tab_table *t; - gsl_matrix *cov = aux_; + + const double df = linreg_n_obs (c) - linreg_n_coeffs (c) - 1; + double q = (1 - cmd->ci) / 2.0; /* 2-tailed test */ + double tval = gsl_cdf_tdist_Qinv (q, df); assert (c != NULL); - n_rows = linreg_n_coeffs (c) + 3; + n_rows = linreg_n_coeffs (c) + heading_rows + 1; + + if (cmd->stats & STATS_CI) + n_cols += 2; t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 1, 0); tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows); tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_vline (t, TAL_0, 1, 0, 0); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta")); - tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)")); - tab_double (t, 2, 1, 0, linreg_intercept (c), NULL); + + tab_hline (t, TAL_1, 2, 4, 1); + tab_joint_text (t, 2, 0, 3, 0, TAB_CENTER | TAT_TITLE, _("Unstandardized Coefficients")); + tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("B")); + tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Std. 
Error")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Standardized Coefficients")); + tab_text (t, 4, 1, TAB_CENTER | TAT_TITLE, _("Beta")); + tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("t")); + tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Sig.")); + tab_text (t, 1, heading_rows, TAB_LEFT | TAT_TITLE, _("(Constant)")); + tab_double (t, 2, heading_rows, 0, linreg_intercept (c), NULL, RC_OTHER); std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0)); - tab_double (t, 3, 1, 0, std_err, NULL); - tab_double (t, 4, 1, 0, 0.0, NULL); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_intercept (c) - tval * std_err ; + double upper = linreg_intercept (c) + tval * std_err ; + tab_double (t, 7, heading_rows, 0, lower, NULL, RC_OTHER); + tab_double (t, 8, heading_rows, 0, upper, NULL, RC_OTHER); + + tab_joint_text_format (t, 7, 0, 8, 0, TAB_CENTER | TAT_TITLE, _("%g%% Confidence Interval for B"), cmd->ci * 100); + tab_hline (t, TAL_1, 7, 8, 1); + tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound")); + tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound")); + } + tab_double (t, 3, heading_rows, 0, std_err, NULL, RC_OTHER); + tab_double (t, 4, heading_rows, 0, 0.0, NULL, RC_OTHER); t_stat = linreg_intercept (c) / std_err; - tab_double (t, 5, 1, 0, t_stat, NULL); - pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, 1, 0, pval, NULL); + tab_double (t, 5, heading_rows, 0, t_stat, NULL, RC_OTHER); + pval = + 2 * gsl_cdf_tdist_Q (fabs (t_stat), + (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); + tab_double (t, 6, heading_rows, 0, pval, NULL, RC_PVALUE); + for (j = 0; j < linreg_n_coeffs (c); j++) { struct string tstr; ds_init_empty (&tstr); - this_row = j + 2; + this_row = j + heading_rows + 1; v = linreg_indep_var (c, j); label = var_to_string (v); /* Do not overwrite the variable's name. 
*/ ds_put_cstr (&tstr, label); - tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr)); + tab_text (t, 1, this_row, TAB_LEFT, ds_cstr (&tstr)); /* - Regression coefficients. - */ - tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL); + Regression coefficients. + */ + tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL, RC_OTHER); /* - Standard error of the coefficients. - */ + Standard error of the coefficients. + */ std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1)); - tab_double (t, 3, this_row, 0, std_err, NULL); + tab_double (t, 3, this_row, 0, std_err, NULL, RC_OTHER); /* - Standardized coefficient, i.e., regression coefficient - if all variables had unit variance. - */ + Standardized coefficient, i.e., regression coefficient + if all variables had unit variance. + */ beta = sqrt (gsl_matrix_get (cov, j, j)); - beta *= linreg_coeff (c, j) / - sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); - tab_double (t, 4, this_row, 0, beta, NULL); + beta *= linreg_coeff (c, j) / + sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1)); + tab_double (t, 4, this_row, 0, beta, NULL, RC_OTHER); /* - Test statistic for H0: coefficient is 0. - */ + Test statistic for H0: coefficient is 0. + */ t_stat = linreg_coeff (c, j) / std_err; - tab_double (t, 5, this_row, 0, t_stat, NULL); + tab_double (t, 5, this_row, 0, t_stat, NULL, RC_OTHER); /* - P values for the test statistic above. - */ - pval = - 2 * gsl_cdf_tdist_Q (fabs (t_stat), - (double) (linreg_n_obs (c) - linreg_n_coeffs (c))); - tab_double (t, 6, this_row, 0, pval, NULL); + P values for the test statistic above. 
+ */ + pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), df); + tab_double (t, 6, this_row, 0, pval, NULL, RC_PVALUE); ds_destroy (&tstr); + + if (cmd->stats & STATS_CI) + { + double lower = linreg_coeff (c, j) - tval * std_err ; + double upper = linreg_coeff (c, j) + tval * std_err ; + + tab_double (t, 7, this_row, 0, lower, NULL, RC_OTHER); + tab_double (t, 8, this_row, 0, upper, NULL, RC_OTHER); + } } - tab_title (t, _("Coefficients")); + tab_title (t, _("Coefficients (%s)"), var_to_string (var)); tab_submit (t); } @@ -829,7 +921,7 @@ reg_stats_coeff (linreg * c, void *aux_) Display the ANOVA table. */ static void -reg_stats_anova (linreg * c, void *aux UNUSED) +reg_stats_anova (const linreg * c, const struct variable *var) { int n_cols = 7; int n_rows = 4; @@ -854,38 +946,38 @@ reg_stats_anova (linreg * c, void *aux UNUSED) tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df")); tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square")); tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F")); - tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); + tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Sig.")); tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression")); tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual")); tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total")); /* Sums of Squares */ - tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL); - tab_double (t, 2, 3, 0, linreg_sst (c), NULL); - tab_double (t, 2, 2, 0, linreg_sse (c), NULL); + tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL, RC_OTHER); + tab_double (t, 2, 3, 0, linreg_sst (c), NULL, RC_OTHER); + tab_double (t, 2, 2, 0, linreg_sse (c), NULL, RC_OTHER); /* Degrees of freedom */ - tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm); - tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe); - tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft); + tab_text_format (t, 3, 1, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfm); + tab_text_format (t, 3, 2, TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dfe); + tab_text_format (t, 3, 3, 
TAB_RIGHT, "%.*g", DBL_DIG + 1, c->dft); /* Mean Squares */ - tab_double (t, 4, 1, TAB_RIGHT, msm, NULL); - tab_double (t, 4, 2, TAB_RIGHT, mse, NULL); + tab_double (t, 4, 1, TAB_RIGHT, msm, NULL, RC_OTHER); + tab_double (t, 4, 2, TAB_RIGHT, mse, NULL, RC_OTHER); - tab_double (t, 5, 1, 0, F, NULL); + tab_double (t, 5, 1, 0, F, NULL, RC_OTHER); - tab_double (t, 6, 1, 0, pval, NULL); + tab_double (t, 6, 1, 0, pval, NULL, RC_PVALUE); - tab_title (t, _("ANOVA")); + tab_title (t, _("ANOVA (%s)"), var_to_string (var)); tab_submit (t); } static void -reg_stats_bcov (linreg * c, void *aux UNUSED) +reg_stats_bcov (const linreg * c, const struct variable *var) { int n_cols; int n_rows; @@ -914,23 +1006,14 @@ reg_stats_bcov (linreg * c, void *aux UNUSED) tab_text (t, 2, i, TAB_CENTER, label); tab_text (t, i + 2, 0, TAB_CENTER, label); for (k = 1; k < linreg_n_coeffs (c); k++) - { - col = (i <= k) ? k : i; - row = (i <= k) ? i : k; - tab_double (t, k + 2, i, TAB_CENTER, - gsl_matrix_get (c->cov, row, col), NULL); - } + { + col = (i <= k) ? k : i; + row = (i <= k) ? i : k; + tab_double (t, k + 2, i, TAB_CENTER, + gsl_matrix_get (c->cov, row, col), NULL, RC_OTHER); + } } - tab_title (t, _("Coefficient Correlations")); + tab_title (t, _("Coefficient Correlations (%s)"), var_to_string (var)); tab_submit (t); } -static void -statistics_keyword_output (void (*function) (linreg *, void *), - bool keyword, linreg * c, void *aux) -{ - if (keyword) - { - (*function) (c, aux); - } -}