From c360fff4fd3e4a98cfe02441f43c27725cead44b Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 14 Aug 2010 19:49:51 +0200 Subject: [PATCH] Add optional callback functions to categoricals. Added callback functions to categoricals to enable per category calculations to be done at the request of the caller. This change also modifies oneway.c (show_descriptives) to use this new feature. However at present it doesn't work properly when multiple dependent variables are specified. --- src/language/stats/glm.q | 7 +- src/language/stats/oneway.c | 186 ++++++++++++++++++++++++------------ src/math/categoricals.c | 41 +++++++- src/math/categoricals.h | 13 ++- 4 files changed, 182 insertions(+), 65 deletions(-) diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q index 13b097f3b8..0d792df898 100644 --- a/src/language/stats/glm.q +++ b/src/language/stats/glm.q @@ -359,9 +359,10 @@ run_glm (struct casereader *input, k++; } - struct categoricals *cats = categoricals_create (categoricals, - n_categoricals, - NULL, MV_NEVER); + struct categoricals *cats = + categoricals_create (categoricals, n_categoricals, + NULL, MV_NEVER, + NULL, NULL, NULL); cov = covariance_2pass_create (n_numerics, numerics, cats, diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c index e210bc4772..f421371c1d 100644 --- a/src/language/stats/oneway.c +++ b/src/language/stats/oneway.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -33,9 +34,9 @@ #include #include +#include #include - #include #include #include @@ -126,11 +127,15 @@ struct oneway_workspace struct hsh_table *group_hash; struct per_var_ws *vws; + + struct moments1 *totals; + double minimum; + double maximum; }; /* Routines to show the output tables */ static void show_anova_table (const struct oneway_spec *, const struct oneway_workspace *); -static void show_descriptives (const struct oneway_spec *); +static void show_descriptives (const struct oneway_spec *, const struct 
oneway_workspace *); static void show_homogeneity (const struct oneway_spec *, const struct oneway_workspace *); static void output_oneway (const struct oneway_spec *, struct oneway_workspace *ws); @@ -305,9 +310,57 @@ free_double (void *value_, const void *aux UNUSED) free (value); } + + static void postcalc (const struct oneway_spec *cmd); static void precalc (const struct oneway_spec *cmd); +struct descriptive_data +{ + struct moments1 *mom; + double minimum; + double maximum; +}; + +static void * +makeit (void) +{ + struct descriptive_data *dd = xmalloc (sizeof *dd); + dd->mom = moments1_create (MOMENT_VARIANCE); + dd->minimum = DBL_MAX; + dd->maximum = -DBL_MAX; + + return dd; +} + +static void +updateit (void *user_data, const struct variable *wv, + const struct variable *catvar, const struct ccase *c, void *aux) +{ + const union value *val = case_data_idx (c, 0); + struct descriptive_data *dd = user_data; + struct oneway_workspace *ws = aux; + + double weight = 1.0; + if (wv) + weight = case_data (c, wv)->f; + + moments1_add (dd->mom, val->f, weight); + moments1_add (ws->totals, val->f, weight); + + if (val->f * weight < dd->minimum) + dd->minimum = val->f * weight; + + if (val->f * weight > dd->maximum) + dd->maximum = val->f * weight; + + + if (val->f * weight < ws->minimum) + ws->minimum = val->f * weight; + + if (val->f * weight > ws->maximum) + ws->maximum = val->f * weight; +} static void run_oneway (const struct oneway_spec *cmd, @@ -320,16 +373,21 @@ run_oneway (const struct oneway_spec *cmd, struct casereader *reader; struct ccase *c; - struct oneway_workspace ws; ws.vws = xmalloc (cmd->n_vars * sizeof (*ws.vws)); + ws.totals = moments1_create (MOMENT_VARIANCE); + ws.minimum = DBL_MAX; + ws.maximum = -DBL_MAX; + for (v = 0; v < cmd->n_vars; ++v) { struct categoricals *cats = categoricals_create (&cmd->indep_var, 1, - cmd->wv, cmd->exclude); + cmd->wv, cmd->exclude, + makeit, + updateit, &ws); ws.vws[v].cov = covariance_2pass_create (1, 
&cmd->vars[v], cats, @@ -474,6 +532,13 @@ run_oneway (const struct oneway_spec *cmd, postcalc (cmd); + for (v = 0; v < cmd->n_vars; ++v) + { + struct categoricals *cats = covariance_get_categoricals (ws.vws[v].cov); + + categoricals_done (cats); + } + if ( cmd->stats & STATS_HOMOGENEITY ) levene (dict, casereader_clone (input), cmd->indep_var, cmd->n_vars, cmd->vars, cmd->exclude); @@ -590,7 +655,7 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws) } if (cmd->stats & STATS_DESCRIPTIVES) - show_descriptives (cmd); + show_descriptives (cmd, ws); if (cmd->stats & STATS_HOMOGENEITY) show_homogeneity (cmd, ws); @@ -700,7 +765,7 @@ show_anova_table (const struct oneway_spec *cmd, const struct oneway_workspace * /* Show the descriptives table */ static void -show_descriptives (const struct oneway_spec *cmd) +show_descriptives (const struct oneway_spec *cmd, const struct oneway_workspace *ws) { size_t v; int n_cols = 10; @@ -710,17 +775,16 @@ show_descriptives (const struct oneway_spec *cmd) const double confidence = 0.95; const double q = (1.0 - confidence) / 2.0; - const struct fmt_spec *wfmt = cmd->wv ? var_get_print_format (cmd->wv) : & F_8_0; + const struct fmt_spec *wfmt = cmd->wv ? 
var_get_print_format (cmd->wv) : &F_8_0; int n_rows = 2; - for ( v = 0; v < cmd->n_vars; ++v ) - n_rows += group_proc_get (cmd->vars[v])->n_groups + 1; + for (v = 0; v < cmd->n_vars; ++v) + n_rows += ws->actual_number_of_groups + 1; t = tab_create (n_cols, n_rows); tab_headers (t, 2, 0, 2, 0); - /* Put a frame around the entire box, and vertical lines inside */ tab_box (t, TAL_2, TAL_2, @@ -750,39 +814,41 @@ show_descriptives (const struct oneway_spec *cmd) tab_text (t, 8, 1, TAB_CENTER | TAT_TITLE, _("Minimum")); tab_text (t, 9, 1, TAB_CENTER | TAT_TITLE, _("Maximum")); - tab_title (t, _("Descriptives")); - row = 2; for (v = 0; v < cmd->n_vars; ++v) { - double T; - double std_error; - - struct group_proc *gp = group_proc_get (cmd->vars[v]); - - struct group_statistics *gs; - struct group_statistics *totals = &gp->ugs; - const char *s = var_to_string (cmd->vars[v]); const struct fmt_spec *fmt = var_get_print_format (cmd->vars[v]); - struct group_statistics *const *gs_array = - (struct group_statistics *const *) hsh_sort (gp->group_hash); int count = 0; + struct per_var_ws *pvw = &ws->vws[v]; + const struct categoricals *cats = covariance_get_categoricals (pvw->cov); + tab_text (t, 0, row, TAB_LEFT | TAT_TITLE, s); if ( v > 0) tab_hline (t, TAL_1, 0, n_cols - 1, row); - for (count = 0; count < hsh_count (gp->group_hash); ++count) + for (count = 0; count < categoricals_total (cats); ++count) { + double T; + double n, mean, variance; + + const union value *gval = categoricals_get_value_by_subscript (cats, count); + const struct descriptive_data *dd = categoricals_get_user_data_by_subscript (cats, count); + + moments1_calculate (dd->mom, &n, &mean, &variance, NULL, NULL); + + double std_dev = sqrt (variance); + double std_error = std_dev / sqrt (n) ; + struct string vstr; + ds_init_empty (&vstr); - gs = gs_array[count]; - var_append_value_name (cmd->indep_var, &gs->id, &vstr); + var_append_value_name (cmd->indep_var, gval, &vstr); tab_text (t, 1, row + count, TAB_LEFT 
| TAT_TITLE, @@ -792,61 +858,68 @@ show_descriptives (const struct oneway_spec *cmd) /* Now fill in the numbers ... */ - tab_fixed (t, 2, row + count, 0, gs->n, 8, 0); + tab_fixed (t, 2, row + count, 0, n, 8, 0); - tab_double (t, 3, row + count, 0, gs->mean, NULL); + tab_double (t, 3, row + count, 0, mean, NULL); - tab_double (t, 4, row + count, 0, gs->std_dev, NULL); + tab_double (t, 4, row + count, 0, std_dev, NULL); - std_error = gs->std_dev / sqrt (gs->n) ; - tab_double (t, 5, row + count, 0, - std_error, NULL); + + tab_double (t, 5, row + count, 0, std_error, NULL); /* Now the confidence interval */ - T = gsl_cdf_tdist_Qinv (q, gs->n - 1); + T = gsl_cdf_tdist_Qinv (q, n - 1); tab_double (t, 6, row + count, 0, - gs->mean - T * std_error, NULL); + mean - T * std_error, NULL); tab_double (t, 7, row + count, 0, - gs->mean + T * std_error, NULL); + mean + T * std_error, NULL); /* Min and Max */ - tab_double (t, 8, row + count, 0, gs->minimum, fmt); - tab_double (t, 9, row + count, 0, gs->maximum, fmt); + tab_double (t, 8, row + count, 0, dd->minimum, fmt); + tab_double (t, 9, row + count, 0, dd->maximum, fmt); } - tab_text (t, 1, row + count, - TAB_LEFT | TAT_TITLE, _("Total")); + { + double T; + double n, mean, variance; - tab_double (t, 2, row + count, 0, totals->n, wfmt); + moments1_calculate (ws->totals, &n, &mean, &variance, NULL, NULL); - tab_double (t, 3, row + count, 0, totals->mean, NULL); + double std_dev = sqrt (variance); + double std_error = std_dev / sqrt (n) ; - tab_double (t, 4, row + count, 0, totals->std_dev, NULL); + tab_text (t, 1, row + count, + TAB_LEFT | TAT_TITLE, _("Total")); - std_error = totals->std_dev / sqrt (totals->n) ; + tab_double (t, 2, row + count, 0, n, wfmt); - tab_double (t, 5, row + count, 0, std_error, NULL); + tab_double (t, 3, row + count, 0, mean, NULL); - /* Now the confidence interval */ + tab_double (t, 4, row + count, 0, std_dev, NULL); - T = gsl_cdf_tdist_Qinv (q, totals->n - 1); + tab_double (t, 5, row + count, 0, 
std_error, NULL); - tab_double (t, 6, row + count, 0, - totals->mean - T * std_error, NULL); + /* Now the confidence interval */ - tab_double (t, 7, row + count, 0, - totals->mean + T * std_error, NULL); + T = gsl_cdf_tdist_Qinv (q, n - 1); - /* Min and Max */ + tab_double (t, 6, row + count, 0, + mean - T * std_error, NULL); - tab_double (t, 8, row + count, 0, totals->minimum, fmt); - tab_double (t, 9, row + count, 0, totals->maximum, fmt); + tab_double (t, 7, row + count, 0, + mean + T * std_error, NULL); + + /* Min and Max */ + + tab_double (t, 8, row + count, 0, ws->minimum, fmt); + tab_double (t, 9, row + count, 0, ws->maximum, fmt); + } - row += gp->n_groups + 1; + row += categoricals_total (cats) + 1; } tab_submit (t); @@ -860,13 +933,9 @@ show_homogeneity (const struct oneway_spec *cmd, const struct oneway_workspace * int n_cols = 5; size_t n_rows = cmd->n_vars + 1; - struct tab_table *t; - - - t = tab_create (n_cols, n_rows); + struct tab_table *t = tab_create (n_cols, n_rows); tab_headers (t, 1, 0, 1, 0); - /* Put a frame around the entire box, and vertical lines inside */ tab_box (t, TAL_2, TAL_2, @@ -1035,7 +1104,6 @@ show_contrast_tests (const struct oneway_spec *cmd, const struct oneway_workspac tab_hline (t, TAL_2, 0, n_cols - 1, 1); tab_vline (t, TAL_2, 3, 0, n_rows - 1); - tab_title (t, _("Contrast Tests")); tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Contrast")); diff --git a/src/math/categoricals.c b/src/math/categoricals.c index df26c1602e..ef9c528994 100644 --- a/src/math/categoricals.c +++ b/src/math/categoricals.c @@ -34,6 +34,9 @@ struct value_node struct hmap_node node; /* Node in hash map. */ union value value; /* The value being labeled. */ double cc; /* The total of the weights of cases with this value */ + + void *user_data; /* A pointer to data which the caller can store stuff */ + int subscript; /* A zero based integer, unique within the variable. 
Can be used as an index into an array */ }; @@ -84,6 +87,17 @@ struct categoricals /* Missing values to be excluded */ enum mv_class exclude; + + + + /* Function to be called on each update */ + update_func *update; + + /* Auxiliary data to be passed to update */ + void *update_aux; + + /* Function specified by the caller to create user_data */ + user_data_create_func *user_data_create; }; @@ -179,10 +193,12 @@ lookup_value (const struct hmap *map, const struct variable *var, const union va } - struct categoricals * categoricals_create (const struct variable *const *v, size_t n_vars, - const struct variable *wv, enum mv_class exclude) + const struct variable *wv, enum mv_class exclude, + user_data_create_func *udf, + update_func *update, void *aux + ) { size_t i; struct categoricals *cat = xmalloc (sizeof *cat); @@ -194,6 +210,9 @@ categoricals_create (const struct variable *const *v, size_t n_vars, cat->reverse_variable_map = NULL; cat->pool = pool_create (); cat->exclude = exclude; + cat->update = update; + cat->update_aux = aux; + cat->user_data_create = udf; cat->vp = pool_calloc (cat->pool, cat->n_vp, sizeof *cat->vp); @@ -246,10 +265,16 @@ categoricals_update (struct categoricals *cat, const struct ccase *c) cat->n_vars++; node->subscript = cat->vp[i].n_cats++ ; + + if ( cat->user_data_create ) + node->user_data = cat->user_data_create (); } node->cc += weight; cat->vp[i].cc += weight; + + if ( cat->update) + cat->update (node->user_data, cat->wv, var, c, cat->update_aux); } } @@ -385,3 +410,15 @@ categoricals_get_n_variables (const struct categoricals *cat) { return cat->n_vars; } + + + +void * +categoricals_get_user_data_by_subscript (const struct categoricals *cat, int subscript) +{ + int vindex = reverse_variable_lookup (cat, subscript); + const struct var_params *vp = &cat->vp[vindex]; + + const struct value_node *vn = vp->reverse_value_map [subscript - vp->base_subscript]; + return vn->user_data; +} diff --git a/src/math/categoricals.h
b/src/math/categoricals.h index 67be020463..ff33540ca1 100644 --- a/src/math/categoricals.h +++ b/src/math/categoricals.h @@ -27,8 +27,17 @@ struct ccase; union value ; +typedef void update_func (void *user_data, + const struct variable *wv, + const struct variable *catvar, + const struct ccase *c, void *aux); + +typedef void *user_data_create_func (void); + struct categoricals *categoricals_create (const struct variable *const *v, size_t n_vars, - const struct variable *wv, enum mv_class exclude); + const struct variable *wv, enum mv_class exclude, + user_data_create_func *udf, + update_func *update, void *update_aux); void categoricals_destroy (struct categoricals *); @@ -63,6 +72,8 @@ double categoricals_get_sum_by_subscript (const struct categoricals *cat, int su double categoricals_get_binary_by_subscript (const struct categoricals *cat, int subscript, const struct ccase *c); +void * categoricals_get_user_data_by_subscript (const struct categoricals *cat, int subscript); + -- 2.30.2