X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdescript.c;h=e84733acb02380eb33d73dc04b9834951dadf1ae;hb=29c51e39acf3554a56aa2adc9451cc5fd70318ae;hp=55b1d37cccbe637c537238fb2e2a3cd579d639d2;hpb=18de82b59d34f4936aa5557da9f35ea189f2d561;p=pspp-builds.git diff --git a/src/descript.c b/src/descript.c index 55b1d37c..e84733ac 100644 --- a/src/descript.c +++ b/src/descript.c @@ -26,8 +26,10 @@ #include #include "algorithm.h" #include "alloc.h" +#include "case.h" #include "casefile.h" #include "command.h" +#include "dictionary.h" #include "lexer.h" #include "error.h" #include "magic.h" @@ -39,6 +41,15 @@ /* DESCRIPTIVES private data. */ +struct dsc_proc; + +/* Handling of missing values. */ +enum dsc_missing_type + { + DSC_VARIABLE, /* Handle missing values on a per-variable basis. */ + DSC_LISTWISE /* Discard entire case if any variable is missing. */ + }; + /* Describes properties of a distribution for the purpose of calculating a Z-score. */ struct dsc_z_score @@ -47,6 +58,7 @@ struct dsc_z_score int dst_idx; /* Destination index into case data. */ double mean; /* Distribution mean. */ double std_dev; /* Distribution standard deviation. */ + struct variable *v; /* Variable on which z-score is based. */ }; /* DESCRIPTIVES transformation (for calculating Z-scores). */ @@ -55,6 +67,10 @@ struct dsc_trns struct trns_header h; struct dsc_z_score *z_scores; /* Array of Z-scores. */ int z_score_cnt; /* Number of Z-scores. */ + struct variable **vars; /* Variables for listwise missing checks. */ + size_t var_cnt; /* Number of variables. */ + enum dsc_missing_type missing_type; /* Treatment of missing values. */ + int include_user_missing; /* Nonzero to include user-missing values. */ }; /* Statistics. Used as bit indexes, so must be 32 or fewer. */ @@ -111,13 +127,6 @@ struct dsc_var double stats[DSC_N_STATS]; /* All the stats' values. */ }; -/* Handling of missing values. */ -enum dsc_missing_type - { - DSC_VARIABLE, /* Handle missing values on a per-variable basis. */ - DSC_LISTWISE /* Discard entire case if any variable is missing. */ - }; - /* Output format. */ enum dsc_format { @@ -142,7 +151,7 @@ struct dsc_proc /* Accumulated results. */ double missing_listwise; /* Sum of weights of cases missing listwise. */ double valid; /* Sum of weights of valid cases. */ - int bad_weight; /* Nonzero if a bad weight has been found. */ + int bad_warn; /* Warn if bad weight found. */ enum dsc_statistic sort_by_stat; /* Statistic to sort by; -1: name. */ int sort_ascending; /* !0: ascending order; 0: descending. */ unsigned long show_stats; /* Statistics to display. */ @@ -189,7 +198,7 @@ cmd_descriptives (void) dsc->format = DSC_LINE; dsc->missing_listwise = 0.; dsc->valid = 0.; - dsc->bad_weight = 0; + dsc->bad_warn = 1; dsc->sort_by_stat = DSC_NONE; dsc->sort_ascending = 1; dsc->show_stats = dsc->calc_stats = DEFAULT_STATS; @@ -254,7 +263,7 @@ cmd_descriptives (void) else if (lex_match_id ("DEFAULT")) dsc->show_stats |= DEFAULT_STATS; else - dsc->show_stats |= 1ul << (match_statistic ()); + dsc->show_stats |= 1ul << (match_statistic ()); lex_match (','); } if (dsc->show_stats == 0) @@ -265,8 +274,12 @@ cmd_descriptives (void) lex_match ('='); if (lex_match_id ("NAME")) dsc->sort_by_stat = DSC_NAME; - else - dsc->sort_by_stat = match_statistic (); + else + { + dsc->sort_by_stat = match_statistic (); + if (dsc->sort_by_stat == DSC_NONE ) + dsc->sort_by_stat = DSC_MEAN; + } if (lex_match ('(')) { if (lex_match_id ("A")) @@ -292,7 +305,7 @@ cmd_descriptives (void) if (!parse_variables (default_dict, &vars, &var_cnt, PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC)) - break; + goto error; dsc->vars = xrealloc (dsc->vars, sizeof *dsc->vars * var_cnt); for (i = dsc->var_cnt; i < var_cnt; i++) @@ -309,7 +322,7 @@ cmd_descriptives (void) if (token != T_ID) { lex_error (NULL); - break; + goto error; } if (try_name (dsc, tokid)) { @@ -318,16 +331,17 @@ cmd_descriptives (void) } else msg (SE, _("Z-score variable name %s would be" - "a duplicate variable name."), tokid); + " a duplicate variable name."), tokid); lex_get (); - lex_force_match (')'); + if (!lex_force_match (')')) + goto error; } } } else { lex_error (NULL); - break; + goto error; } lex_match ('/'); @@ -386,27 +400,26 @@ cmd_descriptives (void) /* Data pass. */ multipass_procedure_with_splits (calc_descriptives, dsc); - if (dsc->bad_weight) - msg (SW, _("At least one case in the data file had a weight value " - "that was system-missing, zero, or negative. These case(s) " - "were ignored.")); /* Z-scoring! */ if (z_cnt) setup_z_trns (dsc); /* Done. */ + free (vars); free_dsc_proc (dsc); return CMD_SUCCESS; error: + free (vars); free_dsc_proc (dsc); return CMD_FAILURE; } -/* Returns the statistic named by the current token and skips - past the token. Emits an error if the current token does not - name a statistic. */ +/* Returns the statistic named by the current token and skips past the token. + Returns DSC_NONE if no statistic is given (e.g., subcommand with no + specifiers). Emits an error if the current token ID does not name a + statistic. */ static enum dsc_statistic match_statistic (void) { @@ -416,14 +429,13 @@ match_statistic (void) for (stat = 0; stat < DSC_N_STATS; stat++) if (lex_match_id (dsc_info[stat].identifier)) - { - lex_get (); - return stat; - } + return stat; + + lex_get(); + lex_error (_("expecting statistic name: reverting to default")); } - lex_error (_("expecting statistic name")); - return DSC_MEAN; + return DSC_NONE; } /* Frees DSC. */ @@ -547,22 +559,47 @@ dump_z_table (struct dsc_proc *dsc) tab_submit (t); } -/* Transformation function to calculate Z-scores. */ +/* Transformation function to calculate Z-scores. Will return SYSMIS if any of + the following are true: 1) mean or standard deviation is SYSMIS 2) score is + SYSMIS 3) score is user missing and they were not included in the original + analyis. 4) any of the variables in the original analysis were missing + (either system or user-missing values that weren't included). +*/ static int descriptives_trns_proc (struct trns_header *trns, struct ccase * c, - int case_num UNUSED) + int case_idx UNUSED) { struct dsc_trns *t = (struct dsc_trns *) trns; struct dsc_z_score *z; + struct variable **vars; + int all_sysmis = 0; + if (t->missing_type == DSC_LISTWISE) + { + assert(t->vars); + for (vars = t->vars; vars < t->vars + t->var_cnt; vars++) + { + double score = case_num (c, (*vars)->fv); + if ( score == SYSMIS || (!t->include_user_missing + && is_num_user_missing(score, *vars)) ) + { + all_sysmis = 1; + break; + } + } + } + for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++) { - double score = c->data[z->src_idx].f; + double input = case_num (c, z->src_idx); + double *output = &case_data_rw (c, z->dst_idx)->f; - if (z->mean == SYSMIS || score == SYSMIS) - c->data[z->dst_idx].f = SYSMIS; + if (z->mean == SYSMIS || z->std_dev == SYSMIS + || all_sysmis || input == SYSMIS + || (!t->include_user_missing && is_num_user_missing(input, z->v))) + *output = SYSMIS; else - c->data[z->dst_idx].f = (score - z->mean) / z->std_dev; + *output = (input - z->mean) / z->std_dev; } return -1; } @@ -574,6 +611,8 @@ descriptives_trns_free (struct trns_header * trns) struct dsc_trns *t = (struct dsc_trns *) trns; free (t->z_scores); + assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL)); + free (t->vars); } /* Sets up a transformation to calculate Z scores. */ @@ -592,6 +631,21 @@ setup_z_trns (struct dsc_proc *dsc) t->h.free = descriptives_trns_free; t->z_scores = xmalloc (cnt * sizeof *t->z_scores); t->z_score_cnt = cnt; + t->missing_type = dsc->missing_type; + t->include_user_missing = dsc->include_user_missing; + if ( t->missing_type == DSC_LISTWISE ) + { + t->var_cnt = dsc->var_cnt; + t->vars = xmalloc(t->var_cnt * sizeof *t->vars); + for (i = 0; i < t->var_cnt; i++) + t->vars[i] = dsc->vars[i].v; + } + else + { + t->var_cnt = 0; + t->vars = NULL; + } + for (cnt = i = 0; i < dsc->var_cnt; i++) { @@ -622,6 +676,7 @@ setup_z_trns (struct dsc_proc *dsc) z->dst_idx = dst_var->fv; z->mean = dv->stats[DSC_MEAN]; z->std_dev = dv->stats[DSC_STDDEV]; + z->v = dv->v; } } @@ -639,7 +694,7 @@ calc_descriptives (const struct casefile *cf, void *dsc_) { struct dsc_proc *dsc = dsc_; struct casereader *reader; - const struct ccase *c; + struct ccase c; int i; for (i = 0; i < dsc->var_cnt; i++) @@ -656,18 +711,16 @@ calc_descriptives (const struct casefile *cf, void *dsc_) dsc->valid = 0.; /* First pass to handle most of the work. */ - reader = casefile_get_reader (cf); - while (casereader_read (reader, &c)) + for (reader = casefile_get_reader (cf); + casereader_read (reader, &c); + case_destroy (&c)) { - double weight = dict_get_case_weight (default_dict, c); + double weight = dict_get_case_weight (default_dict, &c, &dsc->bad_warn); if (weight <= 0.0) - { - dsc->bad_weight = 1; - continue; - } - + continue; + /* Check for missing values. */ - if (listwise_missing (dsc, c)) + if (listwise_missing (dsc, &c)) { dsc->missing_listwise += weight; if (dsc->missing_type == DSC_LISTWISE) @@ -678,7 +731,7 @@ calc_descriptives (const struct casefile *cf, void *dsc_) for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (&c, dv->v->fv); if (dsc->missing_type != DSC_LISTWISE && (x == SYSMIS @@ -689,8 +742,9 @@ calc_descriptives (const struct casefile *cf, void *dsc_) continue; } - if (dv->moments != NULL) + if (dv->moments != NULL) moments_pass_one (dv->moments, x, weight); + if (x < dv->min) dv->min = x; if (x > dv->max) @@ -702,22 +756,24 @@ calc_descriptives (const struct casefile *cf, void *dsc_) /* Second pass for higher-order moments. */ if (dsc->max_moment > MOMENT_MEAN) { - reader = casefile_get_reader (cf); - while (casereader_read (reader, &c)) + for (reader = casefile_get_reader (cf); + casereader_read (reader, &c); + case_destroy (&c)) { - double weight = dict_get_case_weight (default_dict, c); + double weight = dict_get_case_weight (default_dict, &c, + &dsc->bad_warn); if (weight <= 0.0) continue; /* Check for missing values. */ - if (listwise_missing (dsc, c) + if (listwise_missing (dsc, &c) && dsc->missing_type == DSC_LISTWISE) continue; for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (&c, dv->v->fv); if (dsc->missing_type != DSC_LISTWISE && (x == SYSMIS @@ -782,7 +838,7 @@ listwise_missing (struct dsc_proc *dsc, const struct ccase *c) for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (c, dv->v->fv); if (x == SYSMIS || (!dsc->include_user_missing && is_num_user_missing (x, dv->v)))