From 4443108e98b43b196f3f07217c6ec8bd96581367 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Tue, 2 Jul 2013 19:41:16 +0200 Subject: [PATCH] Fixed incorrect behaviour of REGRESSION when multiple dependent variables are entered The REGRESSION command behaved badly when more than one dependent variable was entered. The cause was that the unnormalised covariance matrix returned from the covariance module was mutated for each variable. Consequently, each variable after the first was wrong. This change fixes that by changing the ownership semantics of the returned matrix (and thereby its constness). This change also adds some comments, attends to constness and fixes the remaining memory leaks associated with regression. --- src/language/stats/glm.c | 15 ++-- src/language/stats/oneway.c | 8 +- src/language/stats/regression.c | 135 +++++++++++++++++++------------- src/math/covariance.c | 18 +++-- src/math/covariance.h | 2 +- src/math/linreg.c | 4 +- src/math/linreg.h | 4 +- 7 files changed, 109 insertions(+), 77 deletions(-) diff --git a/src/language/stats/glm.c b/src/language/stats/glm.c index 3d71779ff6..82ca9896e1 100644 --- a/src/language/stats/glm.c +++ b/src/language/stats/glm.c @@ -387,7 +387,7 @@ fill_submatrix (const gsl_matrix * cov, gsl_matrix * submatrix, bool *dropped_f) static void ssq_type1 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) { - gsl_matrix *cm = covariance_calculate_unnormalized (cov); + const gsl_matrix *cm = covariance_calculate_unnormalized (cov); size_t i; size_t k; bool *model_dropped = xcalloc (covariance_dim (cov), sizeof (*model_dropped)); @@ -447,7 +447,6 @@ ssq_type1 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) free (model_dropped); free (submodel_dropped); - gsl_matrix_free (cm); } /* @@ -457,7 +456,7 @@ ssq_type1 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) static void ssq_type2 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) { - gsl_matrix *cm = covariance_calculate_unnormalized (cov); + const gsl_matrix *cm = covariance_calculate_unnormalized (cov); size_t i; size_t k; bool *model_dropped = xcalloc (covariance_dim (cov), sizeof (*model_dropped)); @@ -511,7 +510,6 @@ ssq_type2 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) free (model_dropped); free (submodel_dropped); - gsl_matrix_free (cm); } /* @@ -521,7 +519,7 @@ ssq_type2 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) static void ssq_type3 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) { - gsl_matrix *cm = covariance_calculate_unnormalized (cov); + const gsl_matrix *cm = covariance_calculate_unnormalized (cov); size_t i; size_t k; bool *model_dropped = xcalloc (covariance_dim (cov), sizeof (*model_dropped)); @@ -568,8 +566,6 @@ ssq_type3 (struct covariance *cov, gsl_vector *ssq, const struct glm_spec *cmd) gsl_matrix_free (model_cov); } free (model_dropped); - - gsl_matrix_free (cm); } @@ -657,7 +653,9 @@ run_glm (struct glm_spec *cmd, struct casereader *input, } { - gsl_matrix *cm = covariance_calculate_unnormalized (cov); + const gsl_matrix *ucm = covariance_calculate_unnormalized (cov); + gsl_matrix *cm = gsl_matrix_alloc (ucm->size1, ucm->size2); + gsl_matrix_memcpy (cm, ucm); // dump_matrix (cm); @@ -686,7 +684,6 @@ run_glm (struct glm_spec *cmd, struct casereader *input, break; } // dump_matrix (cm); - gsl_matrix_free (cm); } diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c index a30026e8b4..f9476d5c95 100644 --- a/src/language/stats/oneway.c +++ b/src/language/stats/oneway.c @@ -815,6 +815,7 @@ run_oneway (const struct oneway_spec *cmd, for (v = 0; v < cmd->n_vars; ++v) { + const gsl_matrix *ucm; gsl_matrix *cm; struct per_var_ws *pvw = &ws.vws[v]; const struct categoricals *cats = covariance_get_categoricals (pvw->cov); @@ -828,7 +829,10 @@ run_oneway (const struct oneway_spec *cmd, continue; } - cm = covariance_calculate_unnormalized (pvw->cov); + ucm = covariance_calculate_unnormalized (pvw->cov); + + cm = gsl_matrix_alloc (ucm->size1, ucm->size2); + gsl_matrix_memcpy (cm, ucm); moments1_calculate (ws.dd_total[v]->mom, &pvw->n, NULL, NULL, NULL, NULL); @@ -843,8 +847,6 @@ run_oneway (const struct oneway_spec *cmd, pvw->n_groups = categoricals_n_total (cats); pvw->mse = (pvw->sst - pvw->ssa) / (pvw->n - pvw->n_groups); - - gsl_matrix_free (cm); } for (v = 0; v < cmd->n_vars; ++v) diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c index 6cf94c6ad9..de3194d76d 100644 --- a/src/language/stats/regression.c +++ b/src/language/stats/regression.c @@ -81,15 +81,21 @@ struct regression_workspace { struct per_split_ws *psw; + /* The new variables which will be introduced by /SAVE */ + const struct variable **predvars; + const struct variable **residvars; + + /* A reader/writer pair to temporarily hold the + values of the new variables */ struct casewriter *writer; struct casereader *reader; + /* Indeces of the new values in the reader/writer (-1 if not applicable) */ int res_idx; int pred_idx; - int extras; - const struct variable **predvars; - const struct variable **residvars; + /* 0, 1 or 2 depending on what new variables are to be created */ + int extras; }; static void run_regression (const struct regression *cmd, @@ -98,7 +104,8 @@ static void run_regression (const struct regression *cmd, struct casereader *input); - +/* Return a string based on PREFIX which may be used as the name + of a new variable in DICT */ static char * reg_get_name (const struct dictionary *dict, const char *prefix) { @@ -127,25 +134,39 @@ create_aux_var (struct dataset *ds, const char *prefix) return var; } -struct thing +/* Auxilliary data for transformation when /SAVE is entered */ +struct save_trans_data { int n_dep_vars; struct regression_workspace *ws; }; +static bool +save_trans_free (void *aux) +{ + struct save_trans_data *save_trans_data = aux; + free (save_trans_data->ws->predvars); + free (save_trans_data->ws->residvars); + + casereader_destroy (save_trans_data->ws->reader); + free (save_trans_data->ws); + free (save_trans_data); + return true; +} + static int -transX (void *aux, struct ccase **c, casenumber x UNUSED) +save_trans_func (void *aux, struct ccase **c, casenumber x UNUSED) { - struct thing *thing = aux; - struct regression_workspace *ws = thing->ws; - const struct ccase *in = casereader_read (ws->reader); + struct save_trans_data *save_trans_data = aux; + struct regression_workspace *ws = save_trans_data->ws; + struct ccase *in = casereader_read (ws->reader); if (in) { int k; *c = case_unshare (*c); - for (k = 0; k < thing->n_dep_vars; ++k) + for (k = 0; k < save_trans_data->n_dep_vars; ++k) { if (ws->pred_idx != -1) { @@ -159,6 +180,7 @@ transX (void *aux, struct ccase **c, casenumber x UNUSED) case_data_rw (*c, ws->residvars[k])->f = resid; } } + case_unref (in); } return TRNS_CONTINUE; @@ -168,6 +190,7 @@ transX (void *aux, struct ccase **c, casenumber x UNUSED) int cmd_regression (struct lexer *lexer, struct dataset *ds) { + int i; int n_splits = 0; struct regression_workspace workspace; struct regression regression; @@ -332,6 +355,7 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) "Temporary transformations will be made permanent.")); workspace.writer = autopaging_writer_create (proto); + caseproto_unref (proto); } @@ -357,21 +381,30 @@ cmd_regression (struct lexer *lexer, struct dataset *ds) ok = proc_commit (ds) && ok; } + if (workspace.writer) { - if (workspace.writer) - { - struct thing *thing = xmalloc (sizeof *thing); - struct casereader *r = casewriter_make_reader (workspace.writer); - workspace.writer = NULL; - workspace.reader = r; - thing->ws = xmalloc (sizeof (workspace)); - memcpy (thing->ws, &workspace, sizeof (workspace)); - thing->n_dep_vars = regression.n_dep_vars; + struct save_trans_data *save_trans_data = xmalloc (sizeof *save_trans_data); + struct casereader *r = casewriter_make_reader (workspace.writer); + workspace.writer = NULL; + workspace.reader = r; + save_trans_data->ws = xmalloc (sizeof (workspace)); + memcpy (save_trans_data->ws, &workspace, sizeof (workspace)); + save_trans_data->n_dep_vars = regression.n_dep_vars; - add_transformation (ds, transX, NULL, thing); - } + add_transformation (ds, save_trans_func, save_trans_free, save_trans_data); } + for (i = 0; i < n_splits; ++i) + { + int k; + + for (k = 0; k < regression.n_dep_vars; ++k) + linreg_unref (workspace.psw[i].models[k]); + + free (workspace.psw[i].models); + } + free (workspace.psw); + free (regression.vars); free (regression.dep_vars); @@ -384,7 +417,7 @@ error: return CMD_FAILURE; } - +/* Return the size of the union of dependent and independent variables */ static size_t get_n_all_vars (const struct regression *cmd) { @@ -406,6 +439,7 @@ get_n_all_vars (const struct regression *cmd) return result; } +/* Fill VARS with the union of dependent and independent variables */ static void fill_all_vars (const struct variable **vars, const struct regression *cmd) { @@ -492,7 +526,7 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, const gsl_matrix *ssize_matrix; double result = 0.0; - gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); + const gsl_matrix *cm = covariance_calculate_unnormalized (all_cov); if (cm == NULL) return 0; @@ -543,34 +577,35 @@ fill_covariance (gsl_matrix * cov, struct covariance *all_cov, gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, gsl_matrix_get (cm, dep_subscript, dep_subscript)); free (rows); - gsl_matrix_free (cm); return result; } + /* STATISTICS subcommand output functions. */ -static void reg_stats_r (linreg *, void *, const struct variable *); -static void reg_stats_coeff (linreg *, void *, const struct variable *); -static void reg_stats_anova (linreg *, void *, const struct variable *); -static void reg_stats_bcov (linreg *, void *, const struct variable *); - -static void -statistics_keyword_output (void (*) - (linreg *, void *, const struct variable *), bool, - linreg *, void *, const struct variable *); - +static void reg_stats_r (const linreg *, const struct variable *); +static void reg_stats_coeff (const linreg *, const gsl_matrix *, const struct variable *); +static void reg_stats_anova (const linreg *, const struct variable *); +static void reg_stats_bcov (const linreg *, const struct variable *); static void -subcommand_statistics (const struct regression *cmd, linreg * c, void *aux, +subcommand_statistics (const struct regression *cmd, const linreg * c, const gsl_matrix * cm, const struct variable *var) { - statistics_keyword_output (reg_stats_r, cmd->r, c, aux, var); - statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux, var); - statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux, var); - statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux, var); + if (cmd->r) + reg_stats_r (c, var); + + if (cmd->anova) + reg_stats_anova (c, var); + + if (cmd->coeff) + reg_stats_coeff (c, cm, var); + + if (cmd->bcov) + reg_stats_bcov (c, var); } @@ -688,6 +723,8 @@ run_regression (const struct regression *cmd, double res = linreg_residual (psw->models[k], obs, vals, n_indep); case_data_rw_idx (outc, k * ws->extras + ws->res_idx)->f = res; } + free (vals); + free (vars); } casewriter_write (ws->writer, outc); } @@ -708,7 +745,7 @@ run_regression (const struct regression *cmd, static void -reg_stats_r (linreg * c, void *aux UNUSED, const struct variable *var) +reg_stats_r (const linreg * c, const struct variable *var) { struct tab_table *t; int n_rows = 2; @@ -745,7 +782,7 @@ reg_stats_r (linreg * c, void *aux UNUSED, const struct variable *var) Table showing estimated regression coefficients. */ static void -reg_stats_coeff (linreg * c, void *aux_, const struct variable *var) +reg_stats_coeff (const linreg * c, const gsl_matrix *cov, const struct variable *var) { size_t j; int n_cols = 7; @@ -759,7 +796,6 @@ reg_stats_coeff (linreg * c, void *aux_, const struct variable *var) const struct variable *v; struct tab_table *t; - gsl_matrix *cov = aux_; assert (c != NULL); n_rows = linreg_n_coeffs (c) + 3; @@ -839,7 +875,7 @@ reg_stats_coeff (linreg * c, void *aux_, const struct variable *var) Display the ANOVA table. */ static void -reg_stats_anova (linreg * c, void *aux UNUSED, const struct variable *var) +reg_stats_anova (const linreg * c, const struct variable *var) { int n_cols = 7; int n_rows = 4; @@ -895,7 +931,7 @@ reg_stats_anova (linreg * c, void *aux UNUSED, const struct variable *var) static void -reg_stats_bcov (linreg * c, void *aux UNUSED, const struct variable *var) +reg_stats_bcov (const linreg * c, const struct variable *var) { int n_cols; int n_rows; @@ -935,14 +971,3 @@ reg_stats_bcov (linreg * c, void *aux UNUSED, const struct variable *var) tab_submit (t); } -static void -statistics_keyword_output (void (*function) - (linreg *, void *, const struct variable * var), - bool keyword, linreg * c, void *aux, - const struct variable *var) -{ - if (keyword) - { - (*function) (c, aux, var); - } -} diff --git a/src/math/covariance.c b/src/math/covariance.c index 2500f90423..c28dcb0358 100644 --- a/src/math/covariance.c +++ b/src/math/covariance.c @@ -113,6 +113,8 @@ struct covariance /* Flags indicating that the first case has been seen */ bool pass_one_first_case_seen; bool pass_two_first_case_seen; + + gsl_matrix *unnormalised; }; @@ -202,6 +204,7 @@ covariance_2pass_create (size_t n_vars, const struct variable *const *vars, cov->cm = NULL; cov->categoricals = cats; + cov->unnormalised = NULL; return cov; } @@ -667,27 +670,31 @@ covariance_calculate_single_pass_unnormalized (struct covariance *cov) /* Return a pointer to gsl_matrix containing the pairwise covariances. The - caller owns the returned matrix and must free it when it is no longer - needed. + returned matrix is owned by the structure, and must not be freed. Call this function only after all data have been accumulated. */ -gsl_matrix * +const gsl_matrix * covariance_calculate_unnormalized (struct covariance *cov) { if ( cov->state <= 0 ) return NULL; + if (cov->unnormalised != NULL) + return cov->unnormalised; + switch (cov->passes) { case 1: - return covariance_calculate_single_pass_unnormalized (cov); + cov->unnormalised = covariance_calculate_single_pass_unnormalized (cov); break; case 2: - return covariance_calculate_double_pass_unnormalized (cov); + cov->unnormalised = covariance_calculate_double_pass_unnormalized (cov); break; default: NOT_REACHED (); } + + return cov->unnormalised; } /* Function to access the categoricals used by COV @@ -711,6 +718,7 @@ covariance_destroy (struct covariance *cov) for (i = 0; i < n_MOMENTS; ++i) gsl_matrix_free (cov->moments[i]); + gsl_matrix_free (cov->unnormalised); free (cov->moments); free (cov->cm); free (cov); diff --git a/src/math/covariance.h b/src/math/covariance.h index 7345097195..2aee9f18b9 100644 --- a/src/math/covariance.h +++ b/src/math/covariance.h @@ -40,7 +40,7 @@ void covariance_accumulate_pass1 (struct covariance *, const struct ccase *); void covariance_accumulate_pass2 (struct covariance *, const struct ccase *); gsl_matrix * covariance_calculate (struct covariance *); -gsl_matrix * covariance_calculate_unnormalized (struct covariance *); +const gsl_matrix * covariance_calculate_unnormalized (struct covariance *); void covariance_destroy (struct covariance *cov); diff --git a/src/math/linreg.c b/src/math/linreg.c index be43735785..e84d3fa7f3 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -247,7 +247,7 @@ linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals) /* Mean of the independent variable. */ -double linreg_get_indep_variable_mean (linreg *c, size_t j) +double linreg_get_indep_variable_mean (const linreg *c, size_t j) { assert (c != NULL); return gsl_vector_get (c->indep_means, j); @@ -450,7 +450,7 @@ linreg_set_depvar_mean (linreg *c, double x) } double -linreg_get_depvar_mean (linreg *c) +linreg_get_depvar_mean (const linreg *c) { return c->depvar_mean; } diff --git a/src/math/linreg.h b/src/math/linreg.h index 72fe447b5b..c093a1d041 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -166,7 +166,7 @@ const struct variable ** linreg_get_vars (const linreg *); /* Mean of the independent variable. */ -double linreg_get_indep_variable_mean (linreg *, size_t); +double linreg_get_indep_variable_mean (const linreg *, size_t); void linreg_set_indep_variable_mean (linreg *, size_t, double); double linreg_mse (const linreg *); @@ -183,5 +183,5 @@ double linreg_ssreg (const linreg *); double linreg_dfmodel (const linreg *); double linreg_sst (const linreg *); void linreg_set_depvar_mean (linreg *, double); -double linreg_get_depvar_mean (linreg *); +double linreg_get_depvar_mean (const linreg *); #endif -- 2.30.2