X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fexamine.q;h=e9e0ca7ec252aff8468c96d2992fd21035009a1f;hb=d5e5cf282cc56899913654a62c425e584acecfb2;hp=491a59194fb9723044557a09c778437cb62fcb60;hpb=efbbea5561a1c11b3f794906d1d10dc31b27e21b;p=pspp diff --git a/src/examine.q b/src/examine.q index 491a59194f..e9e0ca7ec2 100644 --- a/src/examine.q +++ b/src/examine.q @@ -1,6 +1,6 @@ /* PSPP - EXAMINE data for normality . -*-c-*- -Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. +Copyright (C) 2004 Free Software Foundation, Inc. Author: John Darrington 2004 This program is free software; you can redistribute it and/or @@ -19,6 +19,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include +#include #include "error.h" #include #include @@ -26,6 +27,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "alloc.h" #include "str.h" #include "case.h" +#include "dictionary.h" #include "command.h" #include "lexer.h" #include "error.h" @@ -38,9 +40,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "vfm.h" #include "hash.h" #include "casefile.h" +#include "factor_stats.h" +/* (headers) */ +#include "chart.h" /* (specification) - "EXAMINE" (examine_): + "EXAMINE" (xmn_): *variables=custom; +total=custom; +nototal=custom; @@ -48,6 +53,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA rep:report/!noreport, incl:include/!exclude; +compare=cmp:variables/!groups; + +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none; +cinterval=double; +statistics[st_]=descriptives,:extreme(*d:n),all,none. */ @@ -57,25 +63,34 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA /* (functions) */ + static struct cmd_examine cmd; static struct variable **dependent_vars; static int n_dependent_vars; -static struct hsh_table *hash_table_factors; +static struct hsh_table *hash_table_factors=0; + + struct factor { - struct variable *v1; - struct hsh_table *hash_table_v1; + /* The independent variable for this factor */ + struct variable *indep_var; + + /* The factor statistics for each value of the independent variable */ + struct hsh_table *hash_table_val; + + /* The subfactor (if any) */ + struct factor *subfactor; - struct variable *v2; - struct hsh_table *hash_table_v2; }; + + /* Parse the clause specifying the factors */ static int examine_parse_independent_vars(struct cmd_examine *cmd, struct hsh_table *hash_factors ); @@ -107,43 +122,87 @@ static void show_extremes(struct variable **dependent_var, int n_extremities); -/* Calculations */ -static void calculate(const struct casefile *cf, void *cmd_); +void np_plot(const struct metrics *m, const char *varname); + + + +/* Per Split function */ +static void run_examine(const struct casefile *cf, void *); + +static void output_examine(void); + + +static struct factor_statistics *totals = 0; + int cmd_examine(void) { - int i; - short total=1; if ( !parse_examine(&cmd) ) return CMD_FAILURE; + + if ( cmd.st_n == SYSMIS ) + cmd.st_n = 5; if ( ! cmd.sbc_cinterval) cmd.n_cinterval[0] = 95.0; - if ( cmd.sbc_nototal ) - total = 0; + totals = xmalloc ( sizeof (struct factor_statistics *) ); + + totals->stats = xmalloc(sizeof ( struct metrics ) * n_dependent_vars); + + multipass_procedure_with_splits (run_examine, NULL); - multipass_procedure_with_splits (calculate, &cmd); + + hsh_destroy(hash_table_factors); + + free(totals->stats); + free(totals); + + return CMD_SUCCESS; +}; + + +/* Show all the appropriate tables */ +static void +output_examine(void) +{ /* Show totals if appropriate */ - if ( total || !hash_table_factors || 0 == hsh_count (hash_table_factors)) + if ( ! cmd.sbc_nototal || + ! hash_table_factors || 0 == hsh_count (hash_table_factors)) { show_summary(dependent_vars, n_dependent_vars,0); if ( cmd.sbc_statistics ) { - if ( cmd.a_statistics[EXAMINE_ST_DESCRIPTIVES]) + if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) show_descriptives(dependent_vars, n_dependent_vars, 0); - if ( cmd.st_n != SYSMIS ) + if ( cmd.a_statistics[XMN_ST_EXTREME]) show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n); } + + if ( cmd.sbc_plot) + { + if ( cmd.a_plot[XMN_PLT_NPPLOT] ) + { + int v; + + for ( v = 0 ; v < n_dependent_vars; ++v ) + { + np_plot(&totals->stats[v], var_to_string(dependent_vars[v])); + } + + } + } + } + /* Show grouped statistics if appropriate */ if ( hash_table_factors && 0 != hsh_count (hash_table_factors)) { @@ -158,24 +217,47 @@ cmd_examine(void) if ( cmd.sbc_statistics ) { - if ( cmd.a_statistics[EXAMINE_ST_DESCRIPTIVES]) - show_descriptives(dependent_vars, n_dependent_vars,f); + if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) + show_descriptives(dependent_vars, n_dependent_vars, f); - if ( cmd.st_n != SYSMIS ) - show_extremes(dependent_vars, n_dependent_vars,f,cmd.st_n); + if ( cmd.a_statistics[XMN_ST_EXTREME]) + show_extremes(dependent_vars, n_dependent_vars, f, cmd.st_n); + } + + + if ( cmd.sbc_plot) + { + if ( cmd.a_plot[XMN_PLT_NPPLOT] ) + { + struct hsh_iterator h2; + struct factor_statistics *foo ; + for (foo = hsh_first(f->hash_table_val,&h2); + foo != 0 ; + foo = hsh_next(f->hash_table_val,&h2)) + { + int v; + for ( v = 0 ; v < n_dependent_vars; ++ v) + { + char buf[100]; + sprintf(buf, "%s (%s = %s)", + var_to_string(dependent_vars[v]), + var_to_string(f->indep_var), + value_to_string(foo->id,f->indep_var)); + np_plot(&foo->stats[v], buf); + } + } + } } } } - hsh_destroy(hash_table_factors); +} - return CMD_SUCCESS; -}; /* TOTAL and NOTOTAL are simple, mutually exclusive flags */ static int -examine_custom_total(struct cmd_examine *p) +xmn_custom_total(struct cmd_examine *p) { if ( p->sbc_nototal ) { @@ -187,7 +269,7 @@ examine_custom_total(struct cmd_examine *p) } static int -examine_custom_nototal(struct cmd_examine *p) +xmn_custom_nototal(struct cmd_examine *p) { if ( p->sbc_total ) { @@ -203,38 +285,33 @@ examine_custom_nototal(struct cmd_examine *p) int compare_factors (const struct factor *f1, const struct factor *f2, - void *aux UNUSED) + void *aux) { - int v1_cmp; + int indep_var_cmp = strcmp(f1->indep_var->name, f2->indep_var->name); - v1_cmp = strcmp(f1->v1->name, f2->v1->name); + if ( 0 != indep_var_cmp ) + return indep_var_cmp; - if ( 0 != v1_cmp ) - return v1_cmp; - - if ( f1->v2 == 0 && f2->v2 == 0 ) + /* If the names are identical, and there are no subfactors then + the factors are identical */ + if ( ! f1->subfactor && ! f2->subfactor ) return 0; + + /* ... otherwise we must compare the subfactors */ - if ( f1->v2 == 0 && f2->v2 != 0 ) - return -1; - - if ( f1->v2 != 0 && f2->v2 == 0 ) - return +1; - - return strcmp(f1->v2->name, f2->v2->name); + return compare_factors(f1->subfactor, f2->subfactor, aux); } /* Create a hash of a factor */ unsigned -hash_factor( const struct factor *f, - void *aux UNUSED) +hash_factor( const struct factor *f, void *aux) { unsigned h; - h = hsh_hash_string(f->v1->name); + h = hsh_hash_string(f->indep_var->name); - if ( f->v2 ) - h += hsh_hash_string(f->v2->name); + if ( f->subfactor ) + h += hash_factor(f->subfactor, aux); return h; } @@ -242,10 +319,12 @@ hash_factor( const struct factor *f, /* Free up a factor */ void -free_factor(struct factor *f, void *aux UNUSED) +free_factor(struct factor *f, void *aux) { - hsh_destroy(f->hash_table_v1); - hsh_destroy(f->hash_table_v2); + hsh_destroy(f->hash_table_val); + + if ( f->subfactor ) + free_factor(f->subfactor, aux); free(f); } @@ -253,7 +332,7 @@ free_factor(struct factor *f, void *aux UNUSED) /* Parser for the variables sub command */ static int -examine_custom_variables(struct cmd_examine *cmd ) +xmn_custom_variables(struct cmd_examine *cmd ) { lex_match('='); @@ -301,36 +380,39 @@ examine_parse_independent_vars(struct cmd_examine *cmd, if ( !f ) { f = xmalloc(sizeof(struct factor)); - f->v2 = 0; - f->v1 = 0; - f->hash_table_v2 = 0; - f->hash_table_v1 = 0; + f->indep_var = 0; + f->hash_table_val = 0; + f->subfactor = 0; } - f->v1 = parse_variable(); + f->indep_var = parse_variable(); - if ( ! f->hash_table_v1 ) - f->hash_table_v1 = hsh_create(4,(hsh_compare_func *)compare_values, - (hsh_hash_func *)hash_value, - 0,(void *) f->v1->width); + if ( ! f->hash_table_val ) + f->hash_table_val = hsh_create(4,(hsh_compare_func *) compare_indep_values, + (hsh_hash_func *) hash_indep_value, + (hsh_free_func *) free_factor_stats, + (void *) f->indep_var->width); if ( token == T_BY ) { lex_match(T_BY); + if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL) && token != T_ALL) return 2; - f->v2 = parse_variable(); + f->subfactor = xmalloc(sizeof(struct factor)); + + f->subfactor->indep_var = parse_variable(); - if ( !f->hash_table_v2 ) - { - f->hash_table_v2 = hsh_create(4, - (hsh_compare_func *) compare_values, - (hsh_hash_func *) hash_value, - 0, - (void *) f->v2->width); - } + f->subfactor->subfactor = 0; + + f->subfactor->hash_table_val = + hsh_create(4, + (hsh_compare_func *) compare_indep_values, + (hsh_hash_func *) hash_indep_value, + (hsh_free_func *) free_factor_stats, + (void *) f->subfactor->indep_var->width); } hsh_insert(hash_table_factors, f); @@ -344,7 +426,8 @@ examine_parse_independent_vars(struct cmd_examine *cmd, } -void populate_descriptives(struct tab_table *t, int col, int row); +void populate_descriptives(struct tab_table *t, int col, int row, + const struct metrics *fs); void populate_extremities(struct tab_table *t, int col, int row, int n); @@ -374,17 +457,17 @@ show_descriptives(struct variable **dependent_var, } else { - assert(factor->v1); - if ( factor->v2 == 0 ) + assert(factor->indep_var); + if ( factor->subfactor == 0 ) { heading_columns = 2; - n_rows += n_dep_var * hsh_count(factor->hash_table_v1) * n_stat_rows; + n_rows += n_dep_var * hsh_count(factor->hash_table_val) * n_stat_rows; } else { heading_columns = 3; - n_rows += n_dep_var * hsh_count(factor->hash_table_v1) * - hsh_count(factor->hash_table_v2) * n_stat_rows ; + n_rows += n_dep_var * hsh_count(factor->hash_table_val) * + hsh_count(factor->subfactor->hash_table_val) * n_stat_rows ; } } @@ -392,7 +475,7 @@ show_descriptives(struct variable **dependent_var, t = tab_create (n_cols, n_rows, 0); - tab_headers (t, heading_columns, 0, heading_rows, 0); + tab_headers (t, heading_columns + 1, 0, heading_rows, 0); tab_dim (t, tab_natural_dimensions); @@ -405,8 +488,8 @@ show_descriptives(struct variable **dependent_var, tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows ); - tab_vline (t, TAL_2, heading_columns, 0, n_rows - 1); - tab_vline (t, TAL_1, n_cols - 2, 0, n_rows - 1); + tab_vline (t, TAL_1, heading_columns, 0, n_rows - 1); + tab_vline (t, TAL_2, n_cols - 2, 0, n_rows - 1); tab_vline (t, TAL_1, n_cols - 1, 0, n_rows - 1); tab_text (t, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic")); @@ -421,9 +504,9 @@ show_descriptives(struct variable **dependent_var, if ( factor ) { - n_factors = hsh_count(factor->hash_table_v1); - if ( factor->v2 ) - n_subfactors = hsh_count(factor->hash_table_v2); + n_factors = hsh_count(factor->hash_table_val); + if ( factor->subfactor ) + n_subfactors = hsh_count(factor->subfactor->hash_table_val); } @@ -432,46 +515,45 @@ show_descriptives(struct variable **dependent_var, if ( i > 0 ) tab_hline(t, TAL_1, 0, n_cols - 1, row ); - - if ( factor ) { struct hsh_iterator hi; - union value *v; + const struct factor_statistics *fs; int count = 0; - tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v1)); + tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, + var_to_string(factor->indep_var)); - for ( v = hsh_first(factor->hash_table_v1, &hi); - v != 0; - v = hsh_next(factor->hash_table_v1, &hi)) + for (fs = hsh_first(factor->hash_table_val, &hi); + fs != 0; + fs = hsh_next(factor->hash_table_val, &hi)) { - struct hsh_iterator h2; - union value *vv; - tab_text (t, 1, row + count * n_subfactors * n_stat_rows, TAB_RIGHT | TAT_TITLE, - value_to_string(v, factor->v1) + value_to_string(fs->id, factor->indep_var) ); if ( count > 0 ) tab_hline (t, TAL_1, 1, n_cols - 1, row + count * n_subfactors * n_stat_rows); - if ( factor->v2 ) + if ( factor->subfactor ) { int count2=0; - + struct hsh_iterator h2; + const struct factor_statistics *sub_fs; + tab_text (t, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v2)); + var_to_string(factor->subfactor->indep_var)); - for ( vv = hsh_first(factor->hash_table_v2, &h2); - vv != 0; - vv = hsh_next(factor->hash_table_v2, &h2)) + for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, + &h2); + sub_fs != 0; + sub_fs = hsh_next(factor->subfactor->hash_table_val, + &h2)) { tab_text(t, 2, @@ -479,7 +561,7 @@ show_descriptives(struct variable **dependent_var, + count * n_subfactors * n_stat_rows + count2 * n_stat_rows, TAB_RIGHT | TAT_TITLE , - value_to_string(vv, factor->v2) + value_to_string(sub_fs->id, factor->subfactor->indep_var) ); if ( count2 > 0 ) @@ -491,8 +573,9 @@ show_descriptives(struct variable **dependent_var, populate_descriptives(t, heading_columns, row + count * n_subfactors - * n_stat_rows - + count2 * n_stat_rows); + * n_stat_rows + + count2 * n_stat_rows, + &sub_fs->stats[i]); count2++; @@ -500,9 +583,11 @@ show_descriptives(struct variable **dependent_var, } else { + populate_descriptives(t, heading_columns, row - + count * n_subfactors * n_stat_rows); + + count * n_subfactors * n_stat_rows, + &fs->stats[i]); } count ++; @@ -511,7 +596,7 @@ show_descriptives(struct variable **dependent_var, else { populate_descriptives(t, heading_columns, - row); + row, &totals->stats[i]); } tab_text (t, @@ -525,83 +610,155 @@ show_descriptives(struct variable **dependent_var, tab_title (t, 0, _("Descriptives")); tab_submit(t); + } /* Fill in the descriptives data */ void -populate_descriptives(struct tab_table *t, int col, int row) +populate_descriptives(struct tab_table *tbl, int col, int row, + const struct metrics *m) { - tab_text (t, col, + const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \ + m->n -1); + + + tab_text (tbl, col, row, TAB_LEFT | TAT_TITLE, _("Mean")); + tab_float (tbl, col + 2, + row, + TAB_CENTER, + m->mean, + 8,2); + + tab_float (tbl, col + 3, + row, + TAB_CENTER, + m->stderr, + 8,3); + - tab_text (t, col, + tab_text (tbl, col, row + 1, TAB_LEFT | TAT_TITLE | TAT_PRINTF, _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]); - tab_text (t, col + 1, - row + 1, - TAB_LEFT | TAT_TITLE, - _("Upper Bound")); - tab_text (t, col + 1, - row + 2, + tab_text (tbl, col + 1, + row + 1, TAB_LEFT | TAT_TITLE, _("Lower Bound")); + tab_float (tbl, col + 2, + row + 1, + TAB_CENTER, + m->mean - t * m->stderr, + 8,3); + + tab_text (tbl, col + 1, + row + 2, + TAB_LEFT | TAT_TITLE, + _("Upper Bound")); + + + tab_float (tbl, col + 2, + row + 2, + TAB_CENTER, + m->mean + t * m->stderr, + 8,3); - tab_text (t, col, + tab_text (tbl, col, row + 3, TAB_LEFT | TAT_TITLE, _("5% Trimmed Mean")); - tab_text (t, col, + tab_float (tbl, col + 2, + row + 3, + TAB_CENTER, + m->trimmed_mean, + 8,2); + + tab_text (tbl, col, row + 4, TAB_LEFT | TAT_TITLE, _("Median")); - tab_text (t, col, + tab_text (tbl, col, row + 5, TAB_LEFT | TAT_TITLE, _("Variance")); - tab_text (t, col, + tab_float (tbl, col + 2, + row + 5, + TAB_CENTER, + m->var, + 8,3); + + + tab_text (tbl, col, row + 6, TAB_LEFT | TAT_TITLE, _("Std. Deviation")); - tab_text (t, col, + + tab_float (tbl, col + 2, + row + 6, + TAB_CENTER, + m->stddev, + 8,3); + + + tab_text (tbl, col, row + 7, TAB_LEFT | TAT_TITLE, _("Minimum")); - tab_text (t, col, + tab_float (tbl, col + 2, + row + 7, + TAB_CENTER, + m->min, + 8,3); + + tab_text (tbl, col, row + 8, TAB_LEFT | TAT_TITLE, _("Maximum")); - tab_text (t, col, + tab_float (tbl, col + 2, + row + 8, + TAB_CENTER, + m->max, + 8,3); + + + tab_text (tbl, col, row + 9, TAB_LEFT | TAT_TITLE, _("Range")); - tab_text (t, col, + + tab_float (tbl, col + 2, + row + 9, + TAB_CENTER, + m->max - m->min, + 8,3); + + tab_text (tbl, col, row + 10, TAB_LEFT | TAT_TITLE, _("Interquartile Range")); - tab_text (t, col, + tab_text (tbl, col, row + 11, TAB_LEFT | TAT_TITLE, _("Skewness")); - tab_text (t, col, + tab_text (tbl, col, row + 12, TAB_LEFT | TAT_TITLE, _("Kurtosis")); @@ -624,7 +781,7 @@ show_summary(struct variable **dependent_var, int heading_columns ; int n_cols; const int heading_rows = 3; - struct tab_table *t; + struct tab_table *tbl; int n_rows = heading_rows; @@ -635,52 +792,52 @@ show_summary(struct variable **dependent_var, } else { - assert(factor->v1); - if ( factor->v2 == 0 ) + assert(factor->indep_var); + if ( factor->subfactor == 0 ) { heading_columns = 2; - n_rows += n_dep_var * hsh_count(factor->hash_table_v1); + n_rows += n_dep_var * hsh_count(factor->hash_table_val); } else { heading_columns = 3; - n_rows += n_dep_var * hsh_count(factor->hash_table_v1) * - hsh_count(factor->hash_table_v2) ; + n_rows += n_dep_var * hsh_count(factor->hash_table_val) * + hsh_count(factor->subfactor->hash_table_val) ; } } n_cols = heading_columns + 6; - t = tab_create (n_cols,n_rows,0); - tab_headers (t, heading_columns, 0, heading_rows, 0); + tbl = tab_create (n_cols,n_rows,0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); - tab_dim (t, tab_natural_dimensions); + tab_dim (tbl, tab_natural_dimensions); /* Outline the box and have vertical internal lines*/ - tab_box (t, + tab_box (tbl, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows ); - tab_hline (t, TAL_1, heading_columns, n_cols - 1, 1 ); - tab_hline (t, TAL_1, 0, n_cols - 1, heading_rows -1 ); + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); + tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 ); + tab_hline (tbl, TAL_1, 0, n_cols - 1, heading_rows -1 ); - tab_vline (t, TAL_2, heading_columns, 0, n_rows - 1); + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - tab_title (t, 0, _("Case Processing Summary")); + tab_title (tbl, 0, _("Case Processing Summary")); - tab_joint_text(t, heading_columns, 0, + tab_joint_text(tbl, heading_columns, 0, n_cols -1, 0, TAB_CENTER | TAT_TITLE, _("Cases")); /* Remove lines ... */ - tab_box (t, + tab_box (tbl, -1, -1, TAL_0, TAL_0, heading_columns, 0, @@ -688,26 +845,26 @@ show_summary(struct variable **dependent_var, if ( factor ) { - tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v1)); + tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, + var_to_string(factor->indep_var)); - if ( factor->v2 ) - tab_text (t, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v2)); + if ( factor->subfactor ) + tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, + var_to_string(factor->subfactor->indep_var)); } for ( i = 0 ; i < 3 ; ++i ) { - tab_text (t, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE, _("N")); - tab_text (t, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE, + tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE, _("N")); + tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE, _("Percent")); - tab_joint_text(t, heading_columns + i*2 , 1, + tab_joint_text(tbl, heading_columns + i*2 , 1, heading_columns + i*2 + 1, 1, TAB_CENTER | TAT_TITLE, subtitle[i]); - tab_box (t, -1, -1, + tab_box (tbl, -1, -1, TAL_0, TAL_0, heading_columns + i*2, 1, heading_columns + i*2 + 1, 1); @@ -722,12 +879,12 @@ show_summary(struct variable **dependent_var, if ( factor ) { - n_factors = hsh_count(factor->hash_table_v1); - if ( factor->v2 ) - n_subfactors = hsh_count(factor->hash_table_v2); + n_factors = hsh_count(factor->hash_table_val); + if ( factor->subfactor ) + n_subfactors = hsh_count(factor->subfactor->hash_table_val); } - tab_text (t, + tab_text (tbl, 0, i * n_factors * n_subfactors + heading_rows, TAB_LEFT | TAT_TITLE, var_to_string(dependent_var[i]) @@ -736,36 +893,38 @@ show_summary(struct variable **dependent_var, if ( factor ) { struct hsh_iterator hi; - union value *v; + const struct factor_statistics *fs; int count = 0; - for ( v = hsh_first(factor->hash_table_v1, &hi); - v != 0; - v = hsh_next(factor->hash_table_v1, &hi)) + for (fs = hsh_first(factor->hash_table_val, &hi); + fs != 0; + fs = hsh_next(factor->hash_table_val, &hi)) { - struct hsh_iterator h2; - union value *vv; - - tab_text (t, 1, + tab_text (tbl, 1, i * n_factors * n_subfactors + heading_rows + count * n_subfactors, TAB_RIGHT | TAT_TITLE, - value_to_string(v, factor->v1) + value_to_string(fs->id, factor->indep_var) ); - if ( factor->v2 ) + if ( factor->subfactor ) { int count2=0; - for ( vv = hsh_first(factor->hash_table_v2, &h2); - vv != 0; - vv = hsh_next(factor->hash_table_v2, &h2)) + struct hsh_iterator h2; + const struct factor_statistics *sub_fs; + + for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, + &h2); + sub_fs != 0; + sub_fs = hsh_next(factor->subfactor->hash_table_val, + &h2)) { - tab_text(t, 2, + tab_text(tbl, 2, i * n_factors * n_subfactors + heading_rows + count * n_subfactors + count2, TAB_RIGHT | TAT_TITLE , - value_to_string(vv, factor->v2) + value_to_string(sub_fs->id, factor->subfactor->indep_var) ); count2++; @@ -777,32 +936,55 @@ show_summary(struct variable **dependent_var, } - tab_submit (t); + tab_submit (tbl); } - - static int bad_weight_warn = 1; + static void -calculate(const struct casefile *cf, void *cmd_) +run_examine(const struct casefile *cf, void *aux UNUSED) { + struct hsh_iterator hi; + struct factor *fctr; + struct casereader *r; struct ccase c; + int v; + + /* Make sure we haven't got rubbish left over from a + previous split */ + if ( hash_table_factors ) + { + for ( fctr = hsh_first(hash_table_factors, &hi); + fctr != 0; + fctr = hsh_next (hash_table_factors, &hi) ) + { + hsh_clear(fctr->hash_table_val); + + while ( (fctr = fctr->subfactor) ) + hsh_clear(fctr->hash_table_val); + } + } - struct cmd_examine *cmd = (struct cmd_examine *) cmd_; + for ( v = 0 ; v < n_dependent_vars ; ++v ) + metrics_precalc(&totals->stats[v]); for(r = casefile_get_reader (cf); casereader_read (r, &c) ; - case_destroy (&c)) + case_destroy (&c) ) { - int i; - struct hsh_iterator hi; - struct factor *fctr; - const double weight = - dict_get_case_weight(default_dict,&c,&bad_weight_warn); + dict_get_case_weight(default_dict, &c, &bad_weight_warn); + + for ( v = 0 ; v < n_dependent_vars ; ++v ) + { + const struct variable *var = dependent_vars[v]; + const union value *val = case_data (&c, var->fv); + + metrics_calc(&totals->stats[v], val, weight); + } if ( hash_table_factors ) { @@ -810,29 +992,117 @@ calculate(const struct casefile *cf, void *cmd_) fctr != 0; fctr = hsh_next (hash_table_factors, &hi) ) { - union value *val; + const union value *indep_val = + case_data(&c, fctr->indep_var->fv); + + struct factor_statistics **foo = ( struct factor_statistics ** ) + hsh_probe(fctr->hash_table_val, (void *) &indep_val); + + if ( !*foo ) + { + *foo = xmalloc ( sizeof ( struct factor_statistics)); + (*foo)->id = indep_val; + (*foo)->stats = xmalloc ( sizeof ( struct metrics ) + * n_dependent_vars); + for ( v = 0 ; v < n_dependent_vars ; ++v ) + metrics_precalc( &(*foo)->stats[v] ); - val = case_data (&c, fctr->v1->fv); - hsh_insert(fctr->hash_table_v1,val); + hsh_insert(fctr->hash_table_val, (void *) *foo); + } - if ( fctr->hash_table_v2 ) + for ( v = 0 ; v < n_dependent_vars ; ++v ) { - val = case_data (&c, fctr->v2->fv); - hsh_insert(fctr->hash_table_v2,val); + const struct variable *var = dependent_vars[v]; + const union value *val = case_data (&c, var->fv); + + metrics_calc( &(*foo)->stats[v], val, weight ); + } + + if ( fctr->subfactor ) + { + struct factor *sfctr = fctr->subfactor; + + const union value *ii_val = + case_data (&c, sfctr->indep_var->fv); + + struct factor_statistics **bar = + (struct factor_statistics **) + hsh_probe(sfctr->hash_table_val, (void *) &ii_val); + + if ( !*bar ) + { + *bar = xmalloc ( sizeof ( struct factor_statistics)); + (*bar)->id = ii_val; + (*bar)->stats = xmalloc ( sizeof ( struct metrics ) + * n_dependent_vars); + + for ( v = 0 ; v < n_dependent_vars ; ++v ) + metrics_precalc( &(*bar)->stats[v] ); + + hsh_insert(sfctr->hash_table_val, + (void *) *bar); + } + + for ( v = 0 ; v < n_dependent_vars ; ++v ) + { + const struct variable *var = dependent_vars[v]; + const union value *val = case_data (&c, var->fv); + + metrics_calc( &(*bar)->stats[v], val, weight ); + } } } } } + + for ( v = 0 ; v < n_dependent_vars ; ++v) + { + if ( hash_table_factors ) + { + for ( fctr = hsh_first(hash_table_factors, &hi); + fctr != 0; + fctr = hsh_next (hash_table_factors, &hi) ) + { + struct hsh_iterator h2; + struct factor_statistics *fs; + + for ( fs = hsh_first(fctr->hash_table_val,&h2); + fs != 0; + fs = hsh_next(fctr->hash_table_val,&h2)) + { + metrics_postcalc( &fs->stats[v] ); + } + + if ( fctr->subfactor) + { + struct hsh_iterator hsf; + struct factor_statistics *fss; + + for ( fss = hsh_first(fctr->subfactor->hash_table_val,&hsf); + fss != 0; + fss = hsh_next(fctr->subfactor->hash_table_val,&hsf)) + { + metrics_postcalc( &fss->stats[v] ); + } + } + } + } + + metrics_postcalc(&totals->stats[v]); + } + + output_examine(); + } static void show_extremes(struct variable **dependent_var, - int n_dep_var, - struct factor *factor, - int n_extremities) + int n_dep_var, + struct factor *factor, + int n_extremities) { int i; int heading_columns ; @@ -849,19 +1119,19 @@ show_extremes(struct variable **dependent_var, } else { - assert(factor->v1); - if ( factor->v2 == 0 ) + assert(factor->indep_var); + if ( factor->subfactor == 0 ) { heading_columns = 2 + 1; n_rows += n_dep_var * 2 * n_extremities - * hsh_count(factor->hash_table_v1); + * hsh_count(factor->hash_table_val); } else { heading_columns = 3 + 1; n_rows += n_dep_var * 2 * n_extremities - * hsh_count(factor->hash_table_v1) - * hsh_count(factor->hash_table_v2) ; + * hsh_count(factor->hash_table_val) + * hsh_count(factor->subfactor->hash_table_val) ; } } @@ -899,11 +1169,11 @@ show_extremes(struct variable **dependent_var, if ( factor ) { tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v1)); + var_to_string(factor->indep_var)); - if ( factor->v2 ) + if ( factor->subfactor ) tab_text (t, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string(factor->v2)); + var_to_string(factor->subfactor->indep_var)); } tab_text (t, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value")); @@ -917,9 +1187,9 @@ show_extremes(struct variable **dependent_var, if ( factor ) { - n_factors = hsh_count(factor->hash_table_v1); - if ( factor->v2 ) - n_subfactors = hsh_count(factor->hash_table_v2); + n_factors = hsh_count(factor->hash_table_val); + if ( factor->subfactor ) + n_subfactors = hsh_count(factor->subfactor->hash_table_val); } tab_text (t, @@ -935,26 +1205,23 @@ show_extremes(struct variable **dependent_var, TAL_1, 0, n_cols - 1, heading_rows + 2 * n_extremities * (i * n_factors * n_subfactors ) - ); + ); if ( factor ) { struct hsh_iterator hi; - union value *v; + const struct factor_statistics *fs; int count = 0; - for ( v = hsh_first(factor->hash_table_v1, &hi); - v != 0; - v = hsh_next(factor->hash_table_v1, &hi)) + for ( fs = hsh_first(factor->hash_table_val, &hi); + fs != 0; + fs = hsh_next(factor->hash_table_val, &hi)) { - struct hsh_iterator h2; - union value *vv; - tab_text (t, 1, heading_rows + 2 * n_extremities * (i * n_factors * n_subfactors + count * n_subfactors), TAB_RIGHT | TAT_TITLE, - value_to_string(v, factor->v1) + value_to_string(fs->id, factor->indep_var) ); if ( count > 0 ) @@ -964,19 +1231,25 @@ show_extremes(struct variable **dependent_var, + count * n_subfactors)); - if ( factor->v2 ) + if ( factor->subfactor ) { + struct hsh_iterator h2; + const struct factor_statistics *sub_fs; int count2=0; - for ( vv = hsh_first(factor->hash_table_v2, &h2); - vv != 0; - vv = hsh_next(factor->hash_table_v2, &h2)) + + for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, + &h2); + sub_fs != 0; + sub_fs = hsh_next(factor->subfactor->hash_table_val, + &h2)) { tab_text(t, 2, heading_rows + 2 * n_extremities * (i * n_factors * n_subfactors + count * n_subfactors + count2 ), TAB_RIGHT | TAT_TITLE , - value_to_string(vv, factor->v2) + value_to_string(sub_fs->id, + factor->subfactor->indep_var) ); @@ -1001,7 +1274,7 @@ show_extremes(struct variable **dependent_var, heading_rows + 2 * n_extremities * (i * n_factors * n_subfactors + count * n_subfactors), - n_extremities); + n_extremities); } count ++; @@ -1017,9 +1290,7 @@ show_extremes(struct variable **dependent_var, } } - tab_submit (t); - } @@ -1035,21 +1306,111 @@ populate_extremities(struct tab_table *t, int col, int row, int n) _("Highest") ); - tab_text(t, col, row + n , TAB_RIGHT | TAT_TITLE , _("Lowest") ); - for (i = 0; i < n ; ++i ) { tab_float(t, col + 1, row + i, - TAB_RIGHT | TAT_TITLE, + TAB_RIGHT, i + 1, 8, 0); tab_float(t, col + 1, row + i + n, - TAB_RIGHT | TAT_TITLE, + TAB_RIGHT, i + 1, 8, 0); } } + + + +/* Plot the normal and detrended normal plots for m + Label the plots with factorname */ +void +np_plot(const struct metrics *m, const char *factorname) +{ + int i; + double yfirst=0, ylast=0; + + /* Normal Plot */ + struct chart np_chart; + + /* Detrended Normal Plot */ + struct chart dnp_chart; + + const struct weighted_value *wv = m->wv; + const int n_data = hsh_count(m->ordered_data) ; + + /* The slope and intercept of the ideal normal probability line */ + const double slope = 1.0 / m->stddev; + const double intercept = - m->mean / m->stddev; + + chart_initialise(&np_chart); + chart_write_title(&np_chart, _("Normal Q-Q Plot of %s"), factorname); + chart_write_xlabel(&np_chart, _("Observed Value")); + chart_write_ylabel(&np_chart, _("Expected Normal")); + + chart_initialise(&dnp_chart); + chart_write_title(&dnp_chart, _("Detrended Normal Q-Q Plot of %s"), + factorname); + chart_write_xlabel(&dnp_chart, _("Observed Value")); + chart_write_ylabel(&dnp_chart, _("Dev from Normal")); + + yfirst = gsl_cdf_ugaussian_Pinv (wv[0].rank / ( m->n + 1)); + ylast = gsl_cdf_ugaussian_Pinv (wv[n_data-1].rank / ( m->n + 1)); + + { + /* Need to make sure that both the scatter plot and the ideal fit into the + plot */ + double x_lower = min(m->min, (yfirst - intercept) / slope) ; + double x_upper = max(m->max, (ylast - intercept) / slope) ; + double slack = (x_upper - x_lower) * 0.05 ; + + chart_write_xscale(&np_chart, x_lower - slack, x_upper + slack, + chart_rounded_tick((m->max - m->min) / 5.0)); + + + chart_write_xscale(&dnp_chart, m->min, m->max, + chart_rounded_tick((m->max - m->min) / 5.0)); + + } + + chart_write_yscale(&np_chart, yfirst, ylast, + chart_rounded_tick((ylast - yfirst)/5.0) ); + + { + /* We have to cache the detrended data, beacause we need to + find its limits before we can plot it */ + double *d_data; + d_data = xmalloc (n_data * sizeof(double)); + double d_max = -DBL_MAX; + double d_min = DBL_MAX; + for ( i = 0 ; i < n_data; ++i ) + { + const double ns = gsl_cdf_ugaussian_Pinv (wv[i].rank / ( m->n + 1)); + + chart_datum(&np_chart, 0, wv[i].v.f, ns); + + d_data[i] = (wv[i].v.f - m->mean) / m->stddev - ns; + + if ( d_data[i] < d_min ) d_min = d_data[i]; + if ( d_data[i] > d_max ) d_max = d_data[i]; + } + + chart_write_yscale(&dnp_chart, d_min, d_max, + chart_rounded_tick((d_max - d_min) / 5.0)); + + for ( i = 0 ; i < n_data; ++i ) + chart_datum(&dnp_chart, 0, wv[i].v.f, d_data[i]); + + free(d_data); + } + + chart_line(&np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y); + chart_line(&dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X); + + chart_finalise(&np_chart); + chart_finalise(&dnp_chart); + +}