X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Froc.c;h=024c9f85297c5bf3fc15fae0f7af95fac121b09e;hb=3d8d78ad9ca206b6489cc3944c985c8ba89e4b1e;hp=e6251b36b38ff79c495931e1b805f4489df3c992;hpb=8f07df1b704439ced3bb6715da86f71866cca41f;p=pspp-builds.git diff --git a/src/language/stats/roc.c b/src/language/stats/roc.c index e6251b36..024c9f85 100644 --- a/src/language/stats/roc.c +++ b/src/language/stats/roc.c @@ -16,10 +16,10 @@ #include -#include "roc.h" #include #include #include +#include #include #include @@ -47,8 +47,9 @@ struct cmd_roc { size_t n_vars; const struct variable **vars; + const struct dictionary *dict; - struct variable *state_var ; + const struct variable *state_var ; union value state_value; /* Plot the roc curve */ @@ -66,6 +67,11 @@ struct cmd_roc bool invert ; /* True iff a smaller test result variable indicates a positive result */ + + double pos; + double neg; + double pos_weighted; + double neg_weighted; }; static int run_roc (struct dataset *ds, struct cmd_roc *roc); @@ -86,21 +92,24 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) roc.ci = 95; roc.bi_neg_exp = false; roc.invert = false; + roc.pos = roc.pos_weighted = 0; + roc.neg = roc.neg_weighted = 0; + roc.dict = dataset_dict (ds); if (!parse_variables_const (lexer, dict, &roc.vars, &roc.n_vars, PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC)) - return 2; + goto error;; if ( ! 
lex_force_match (lexer, T_BY)) { - return 2; + goto error;; } roc.state_var = parse_variable (lexer, dict); if ( !lex_force_match (lexer, '(')) { - return 2; + goto error;; } parse_value (lexer, &roc.state_value, var_get_width (roc.state_var)); @@ -108,7 +117,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) if ( !lex_force_match (lexer, ')')) { - return 2; + goto error;; } @@ -131,7 +140,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } } } @@ -155,7 +164,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } } else if (lex_match_id (lexer, "PRINT")) @@ -174,7 +183,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } } } @@ -197,7 +206,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } lex_force_match (lexer, ')'); } @@ -215,7 +224,7 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } lex_force_match (lexer, ')'); } @@ -241,14 +250,14 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) else { lex_error (lexer, NULL); - return 2; + goto error;; } lex_force_match (lexer, ')'); } else { lex_error (lexer, NULL); - return 2; + goto error;; } } } @@ -259,9 +268,14 @@ cmd_roc (struct lexer *lexer, struct dataset *ds) } } - run_roc (ds, &roc); + if ( ! run_roc (ds, &roc)) + goto error;; + + return CMD_SUCCESS; - return 1; + error: + free (roc.vars); + return CMD_FAILURE; } @@ -310,14 +324,36 @@ dump_casereader (struct casereader *reader) } #endif + +/* + Return true iff the state variable indicates that C has positive actual state. + + As a side effect, this function also accumulates the roc->{pos,neg} and + roc->{pos,neg}_weighted counts. 
+ */ static bool match_positives (const struct ccase *c, void *aux) { struct cmd_roc *roc = aux; + const struct variable *wv = dict_get_weight (roc->dict); + const double weight = wv ? case_data (c, wv)->f : 1.0; + + const bool positive = + ( 0 == value_compare_3way (case_data (c, roc->state_var), &roc->state_value, + var_get_width (roc->state_var))); + + if ( positive ) + { + roc->pos++; + roc->pos_weighted += weight; + } + else + { + roc->neg++; + roc->neg_weighted += weight; + } - return 0 == value_compare_3way (case_data (c, roc->state_var), - &roc->state_value, - var_get_width (roc->state_var)); + return positive; } @@ -325,6 +361,8 @@ match_positives (const struct ccase *c, void *aux) #define N_EQ 1 #define N_PRED 2 +/* Some intermediate state for calculating the cutpoints and the + standard error values */ struct roc_state { double auc; @@ -342,8 +380,6 @@ struct roc_state double max; }; - - #define CUTPOINT 0 #define TP 1 #define FN 2 @@ -351,6 +387,15 @@ struct roc_state #define FP 4 +/* + Return a new casereader based upon CUTPOINT_RDR. + The number of "positive" cases are placed into + the position TRUE_INDEX, and the number of "negative" cases + into FALSE_INDEX. + POS_COND and RESULT determine the semantics of what is + "positive". + WEIGHT is the value of a single count. 
+ */ static struct casereader * accumulate_counts (struct casereader *cutpoint_rdr, double result, double weight, @@ -364,12 +409,13 @@ accumulate_counts (struct casereader *cutpoint_rdr, struct ccase *cpc; double prev_cp = SYSMIS; - for ( ; (cpc = casereader_read (r) ); case_unref (cpc)) { struct ccase *new_case; const double cp = case_data_idx (cpc, CUTPOINT)->f; + assert (cp != SYSMIS); + /* We don't want duplicates here */ if ( cp == prev_cp ) continue; @@ -377,13 +423,9 @@ accumulate_counts (struct casereader *cutpoint_rdr, new_case = case_clone (cpc); if ( pos_cond (result, cp)) - { - case_data_rw_idx (new_case, true_index)->f += weight; - } + case_data_rw_idx (new_case, true_index)->f += weight; else - { - case_data_rw_idx (new_case, false_index)->f += weight; - } + case_data_rw_idx (new_case, false_index)->f += weight; prev_cp = cp; @@ -407,10 +449,10 @@ process_group (const struct variable *var, struct casereader *reader, struct casereader **cutpoint_rdr, bool (*pos_cond) (double, double), int true_index, - int false_index - ) + int false_index) { const struct variable *w = dict_get_weight (dict); + struct casereader *r1 = casereader_create_distinct (sort_execute_1var (reader, var), var, w); @@ -479,6 +521,12 @@ process_group (const struct variable *var, struct casereader *reader, return casewriter_make_reader (wtr); } +/* Some more indices into case data */ +#define N_POS_EQ 1 /* number of positive cases with values equal to n */ +#define N_POS_GT 2 /* number of positive cases with values greater than n */ +#define N_NEG_EQ 3 /* number of negative cases with values equal to n */ +#define N_NEG_LT 4 /* number of negative cases with values less than n */ + static bool gt (double d1, double d2) { @@ -498,6 +546,14 @@ lt (double d1, double d2) return d1 < d2; } + +/* + Return a casereader with width 3, + populated with cases based upon READER. 
+ The cases will have the values: + (N, number of cases equal to N, number of cases greater than N) + As a side effect, update RS->n1 with the number of positive cases. +*/ static struct casereader * process_positive_group (const struct variable *var, struct casereader *reader, const struct dictionary *dict, @@ -509,7 +565,13 @@ process_positive_group (const struct variable *var, struct casereader *reader, TP, FN); } - +/* + Return a casereader with width 3, + populated with cases based upon READER. + The cases will have the values: + (N, number of cases equal to N, number of cases less than N) + As a side effect, update RS->n2 with the number of negative cases. +*/ static struct casereader * process_negative_group (const struct variable *var, struct casereader *reader, const struct dictionary *dict, @@ -535,12 +597,17 @@ append_cutpoint (struct casewriter *writer, double cutpoint) case_data_rw_idx (cc, TN)->f = 0; case_data_rw_idx (cc, FP)->f = 0; - casewriter_write (writer, cc); } -/* Prepare the cutpoints */ +/* + Create and initialise the rs[x].cutpoint_rdr casereaders. That is, the readers will + be created with width 5, ready to take the values (cutpoint, TP, FN, TN, FP), and the + reader will be populated with its final number of cases. + However on exit from this function, only CUTPOINT entries will be set to their final + value. The other entries will be initialised to zero. 
+*/ static void prepare_cutpoints (struct cmd_roc *roc, struct roc_state *rs, struct casereader *input) { @@ -570,7 +637,11 @@ prepare_cutpoints (struct cmd_roc *roc, struct roc_state *rs, struct casereader { for (i = 0 ; i < roc->n_vars; ++i) { - const double result = case_data (c, roc->vars[i])->f; + const union value *v = case_data (c, roc->vars[i]); + const double result = v->f; + + if ( mv_is_value_missing (var_get_missing_values (roc->vars[i]), v, roc->exclude)) + continue; minimize (&rs[i].min, result); maximize (&rs[i].max, result); @@ -598,14 +669,12 @@ prepare_cutpoints (struct cmd_roc *roc, struct roc_state *rs, struct casereader } static void -do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) +do_roc (struct cmd_roc *roc, struct casereader *reader, struct dictionary *dict) { int i; struct roc_state *rs = xcalloc (roc->n_vars, sizeof *rs); - struct casewriter *neg_wtr = autopaging_writer_create (casereader_get_proto (input)); - struct casereader *negatives = NULL; struct casereader *positives = NULL; @@ -614,8 +683,26 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) struct subcase up_ordering; struct subcase down_ordering; + struct casewriter *neg_wtr = NULL; + + struct casereader *input = casereader_create_filter_missing (reader, + roc->vars, roc->n_vars, + roc->exclude, + NULL, + NULL); + + input = casereader_create_filter_missing (input, + &roc->state_var, 1, + roc->exclude, + NULL, + NULL); + + neg_wtr = autopaging_writer_create (casereader_get_proto (input)); + prepare_cutpoints (roc, rs, input); + + /* Separate the positive actual state cases from the negative ones */ positives = casereader_create_filter_func (input, match_positives, @@ -648,6 +735,7 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) struct casereader *neg ; struct casereader *pos = casereader_clone (positives); + struct casereader *n_pos = process_positive_group (var, pos, dict, 
&rs[i]); @@ -660,6 +748,8 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) n_neg = process_negative_group (var, neg, dict, &rs[i]); + + /* Merge the n_pos and n_neg casereaders */ w = sort_create_writer (&up_ordering, n_proto); for ( ; (cpos = casereader_read (n_pos) ); case_unref (cpos)) { @@ -674,12 +764,12 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) const double jneg = case_data_idx (cneg, VALUE)->f; case_data_rw_idx (nc, VALUE)->f = jneg; - case_data_rw_idx (nc, N_EQ)->f = 0; + case_data_rw_idx (nc, N_POS_EQ)->f = 0; - case_data_rw_idx (nc, N_PRED)->f = SYSMIS; + case_data_rw_idx (nc, N_POS_GT)->f = SYSMIS; - *case_data_rw_idx (nc, 3) = *case_data_idx (cneg, N_EQ); - *case_data_rw_idx (nc, 4) = *case_data_idx (cneg, N_PRED); + *case_data_rw_idx (nc, N_NEG_EQ) = *case_data_idx (cneg, N_EQ); + *case_data_rw_idx (nc, N_NEG_LT) = *case_data_idx (cneg, N_PRED); casewriter_write (w, nc); @@ -689,29 +779,35 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) } case_data_rw_idx (pos_case, VALUE)->f = jpos; - *case_data_rw_idx (pos_case, N_EQ) = *case_data_idx (cpos, N_EQ); - *case_data_rw_idx (pos_case, N_PRED) = *case_data_idx (cpos, N_PRED); - case_data_rw_idx (pos_case, 3)->f = 0; - case_data_rw_idx (pos_case, 4)->f = SYSMIS; + *case_data_rw_idx (pos_case, N_POS_EQ) = *case_data_idx (cpos, N_EQ); + *case_data_rw_idx (pos_case, N_POS_GT) = *case_data_idx (cpos, N_PRED); + case_data_rw_idx (pos_case, N_NEG_EQ)->f = 0; + case_data_rw_idx (pos_case, N_NEG_LT)->f = SYSMIS; casewriter_write (w, pos_case); } +/* These aren't used anymore */ +#undef N_EQ +#undef N_PRED + r = casewriter_make_reader (w); + /* Propagate the N_POS_GT values from the positive cases + to the negative ones */ { double prev_pos_gt = rs[i].n1; w = sort_create_writer (&down_ordering, n_proto); for ( ; (c = casereader_read (r) ); case_unref (c)) { - double n_pos_gt = case_data_idx (c, N_PRED)->f; + 
double n_pos_gt = case_data_idx (c, N_POS_GT)->f; struct ccase *nc = case_clone (c); if ( n_pos_gt == SYSMIS) { n_pos_gt = prev_pos_gt; - case_data_rw_idx (nc, N_PRED)->f = n_pos_gt; + case_data_rw_idx (nc, N_POS_GT)->f = n_pos_gt; } casewriter_write (w, nc); @@ -721,19 +817,21 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) r = casewriter_make_reader (w); } + /* Propagate the N_NEG_LT values from the negative cases + to the positive ones */ { double prev_neg_lt = rs[i].n2; w = sort_create_writer (&up_ordering, n_proto); for ( ; (c = casereader_read (r) ); case_unref (c)) { - double n_neg_lt = case_data_idx (c, 4)->f; + double n_neg_lt = case_data_idx (c, N_NEG_LT)->f; struct ccase *nc = case_clone (c); if ( n_neg_lt == SYSMIS) { n_neg_lt = prev_neg_lt; - case_data_rw_idx (nc, 4)->f = n_neg_lt; + case_data_rw_idx (nc, N_NEG_LT)->f = n_neg_lt; } casewriter_write (w, nc); @@ -750,23 +848,23 @@ do_roc (struct cmd_roc *roc, struct casereader *input, struct dictionary *dict) const struct ccase *next_case = casereader_peek (r, 0); const double j = case_data_idx (c, VALUE)->f; - double n_pos_eq = case_data_idx (c, N_EQ)->f; - double n_pos_gt = case_data_idx (c, N_PRED)->f; - double n_neg_eq = case_data_idx (c, 3)->f; - double n_neg_lt = case_data_idx (c, 4)->f; + double n_pos_eq = case_data_idx (c, N_POS_EQ)->f; + double n_pos_gt = case_data_idx (c, N_POS_GT)->f; + double n_neg_eq = case_data_idx (c, N_NEG_EQ)->f; + double n_neg_lt = case_data_idx (c, N_NEG_LT)->f; if ( prev_case && j == case_data_idx (prev_case, VALUE)->f) { - if ( 0 == case_data_idx (c, N_EQ)->f) + if ( 0 == case_data_idx (c, N_POS_EQ)->f) { - n_pos_eq = case_data_idx (prev_case, N_EQ)->f; - n_pos_gt = case_data_idx (prev_case, N_PRED)->f; + n_pos_eq = case_data_idx (prev_case, N_POS_EQ)->f; + n_pos_gt = case_data_idx (prev_case, N_POS_GT)->f; } - if ( 0 == case_data_idx (c, 3)->f) + if ( 0 == case_data_idx (c, N_NEG_EQ)->f) { - n_neg_eq = case_data_idx (prev_case, 
3)->f; - n_neg_lt = case_data_idx (prev_case, 4)->f; + n_neg_eq = case_data_idx (prev_case, N_NEG_EQ)->f; + n_neg_lt = case_data_idx (prev_case, N_NEG_LT)->f; } } @@ -948,13 +1046,11 @@ show_summary (const struct cmd_roc *roc) tab_text (tbl, 0, 3, TAB_LEFT, _("Negative")); -#if 0 tab_double (tbl, 1, 2, 0, roc->pos, &F_8_0); tab_double (tbl, 1, 3, 0, roc->neg, &F_8_0); tab_double (tbl, 2, 2, 0, roc->pos_weighted, 0); tab_double (tbl, 2, 3, 0, roc->neg_weighted, 0); -#endif tab_submit (tbl); }