X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdescript.c;h=a7152301abfdfcfcefb93af2dfea68b6d1b3205f;hb=b9799cdd10b30ea96d9178b7a0d48504d052228c;hp=7d25b3138d4199f85a8036b6b78aa1cd6a991959;hpb=cf89e411db41c05c39753b05cf144c8b26a44d96;p=pspp-builds.git diff --git a/src/descript.c b/src/descript.c index 7d25b313..a7152301 100644 --- a/src/descript.c +++ b/src/descript.c @@ -14,8 +14,8 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. */ + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ /* FIXME: Many possible optimizations. */ @@ -26,8 +26,10 @@ #include #include "algorithm.h" #include "alloc.h" +#include "case.h" #include "casefile.h" #include "command.h" +#include "dictionary.h" #include "lexer.h" #include "error.h" #include "magic.h" @@ -37,8 +39,21 @@ #include "var.h" #include "vfm.h" +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + /* DESCRIPTIVES private data. */ +struct dsc_proc; + +/* Handling of missing values. */ +enum dsc_missing_type + { + DSC_VARIABLE, /* Handle missing values on a per-variable basis. */ + DSC_LISTWISE /* Discard entire case if any variable is missing. */ + }; + /* Describes properties of a distribution for the purpose of calculating a Z-score. */ struct dsc_z_score @@ -47,14 +62,18 @@ struct dsc_z_score int dst_idx; /* Destination index into case data. */ double mean; /* Distribution mean. */ double std_dev; /* Distribution standard deviation. */ + struct variable *v; /* Variable on which z-score is based. */ }; /* DESCRIPTIVES transformation (for calculating Z-scores). */ struct dsc_trns { - struct trns_header h; struct dsc_z_score *z_scores; /* Array of Z-scores. */ int z_score_cnt; /* Number of Z-scores. */ + struct variable **vars; /* Variables for listwise missing checks. */ + size_t var_cnt; /* Number of variables. */ + enum dsc_missing_type missing_type; /* Treatment of missing values. */ + int include_user_missing; /* Nonzero to include user-missing values. */ }; /* Statistics. Used as bit indexes, so must be 32 or fewer. */ @@ -104,20 +123,13 @@ static const struct dsc_statistic_info dsc_info[DSC_N_STATS] = struct dsc_var { struct variable *v; /* Variable to calculate on. */ - char z_name[9]; /* Name for z-score variable. */ + char z_name[LONG_NAME_LEN + 1]; /* Name for z-score variable. */ double valid, missing; /* Valid, missing counts. */ struct moments *moments; /* Moments. */ double min, max; /* Maximum and mimimum values. */ double stats[DSC_N_STATS]; /* All the stats' values. */ }; -/* Handling of missing values. */ -enum dsc_missing_type - { - DSC_VARIABLE, /* Handle missing values on a per-variable basis. */ - DSC_LISTWISE /* Discard entire case if any variable is missing. */ - }; - /* Output format. */ enum dsc_format { @@ -157,7 +169,7 @@ static void free_dsc_proc (struct dsc_proc *); /* Z-score functions. */ static int try_name (struct dsc_proc *dsc, char *name); static int generate_z_varname (struct dsc_proc *dsc, char *z_name, - const char *name, int *z_cnt); + const char *name, size_t *z_cnt); static void dump_z_table (struct dsc_proc *); static void setup_z_trns (struct dsc_proc *); @@ -173,10 +185,10 @@ cmd_descriptives (void) { struct dsc_proc *dsc; struct variable **vars = NULL; - int var_cnt = 0; + size_t var_cnt = 0; int save_z_scores = 0; - int z_cnt = 0; - int i; + size_t z_cnt = 0; + size_t i; /* Create and initialize dsc. */ dsc = xmalloc (sizeof *dsc); @@ -254,7 +266,7 @@ cmd_descriptives (void) else if (lex_match_id ("DEFAULT")) dsc->show_stats |= DEFAULT_STATS; else - dsc->show_stats |= 1ul << (match_statistic ()); + dsc->show_stats |= 1ul << (match_statistic ()); lex_match (','); } if (dsc->show_stats == 0) @@ -265,8 +277,12 @@ cmd_descriptives (void) lex_match ('='); if (lex_match_id ("NAME")) dsc->sort_by_stat = DSC_NAME; - else - dsc->sort_by_stat = match_statistic (); + else + { + dsc->sort_by_stat = match_statistic (); + if (dsc->sort_by_stat == DSC_NONE ) + dsc->sort_by_stat = DSC_MEAN; + } if (lex_match ('(')) { if (lex_match_id ("A")) @@ -292,9 +308,9 @@ cmd_descriptives (void) if (!parse_variables (default_dict, &vars, &var_cnt, PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC)) - break; + goto error; - dsc->vars = xrealloc (dsc->vars, sizeof *dsc->vars * var_cnt); + dsc->vars = xnrealloc (dsc->vars, var_cnt, sizeof *dsc->vars); for (i = dsc->var_cnt; i < var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; @@ -309,7 +325,7 @@ cmd_descriptives (void) if (token != T_ID) { lex_error (NULL); - break; + goto error; } if (try_name (dsc, tokid)) { @@ -318,16 +334,17 @@ cmd_descriptives (void) } else msg (SE, _("Z-score variable name %s would be" - "a duplicate variable name."), tokid); + " a duplicate variable name."), tokid); lex_get (); - lex_force_match (')'); + if (!lex_force_match (')')) + goto error; } } } else { lex_error (NULL); - break; + goto error; } lex_match ('/'); @@ -343,7 +360,7 @@ cmd_descriptives (void) { if (save_z_scores) { - int gen_cnt = 0; + size_t gen_cnt = 0; for (i = 0; i < dsc->var_cnt; i++) if (dsc->vars[i].z_name[0] == 0) @@ -402,9 +419,10 @@ cmd_descriptives (void) return CMD_FAILURE; } -/* Returns the statistic named by the current token and skips - past the token. Emits an error if the current token does not - name a statistic. */ +/* Returns the statistic named by the current token and skips past the token. + Returns DSC_NONE if no statistic is given (e.g., subcommand with no + specifiers). Emits an error if the current token ID does not name a + statistic. */ static enum dsc_statistic match_statistic (void) { @@ -414,14 +432,13 @@ match_statistic (void) for (stat = 0; stat < DSC_N_STATS; stat++) if (lex_match_id (dsc_info[stat].identifier)) - { - lex_get (); - return stat; - } + return stat; + + lex_get(); + lex_error (_("expecting statistic name: reverting to default")); } - lex_error (_("expecting statistic name")); - return DSC_MEAN; + return DSC_NONE; } /* Frees DSC. */ @@ -446,12 +463,12 @@ free_dsc_proc (struct dsc_proc *dsc) static int try_name (struct dsc_proc *dsc, char *name) { - int i; + size_t i; if (dict_lookup_var (default_dict, name) != NULL) return 0; for (i = 0; i < dsc->var_cnt; i++) - if (!strcmp (dsc->vars[i].z_name, name)) + if (!strcasecmp (dsc->vars[i].z_name, name)) return 0; return 1; } @@ -462,14 +479,13 @@ try_name (struct dsc_proc *dsc, char *name) copies the new name into Z_NAME. On failure, returns zero. */ static int generate_z_varname (struct dsc_proc *dsc, char *z_name, - const char *var_name, int *z_cnt) + const char *var_name, size_t *z_cnt) { - char name[10]; + char name[LONG_NAME_LEN + 1]; /* Try a name based on the original variable name. */ name[0] = 'Z'; - strcpy (name + 1, var_name); - name[8] = '\0'; + str_copy_trunc (name + 1, sizeof name - 1, var_name); if (try_name (dsc, name)) { strcpy (z_name, name); @@ -510,11 +526,11 @@ generate_z_varname (struct dsc_proc *dsc, char *z_name, static void dump_z_table (struct dsc_proc *dsc) { - int cnt = 0; + size_t cnt = 0; struct tab_table *t; { - int i; + size_t i; for (i = 0; i < dsc->var_cnt; i++) if (dsc->vars[i].z_name[0] != '\0') @@ -532,7 +548,7 @@ dump_z_table (struct dsc_proc *dsc) tab_dim (t, tab_natural_dimensions); { - int i, y; + size_t i, y; for (i = 0, y = 1; i < dsc->var_cnt; i++) if (dsc->vars[i].z_name[0] != '\0') @@ -545,33 +561,62 @@ dump_z_table (struct dsc_proc *dsc) tab_submit (t); } -/* Transformation function to calculate Z-scores. */ +/* Transformation function to calculate Z-scores. Will return SYSMIS if any of + the following are true: 1) mean or standard deviation is SYSMIS 2) score is + SYSMIS 3) score is user missing and they were not included in the original + analyis. 4) any of the variables in the original analysis were missing + (either system or user-missing values that weren't included). +*/ static int -descriptives_trns_proc (struct trns_header *trns, struct ccase * c, - int case_num UNUSED) +descriptives_trns_proc (void *trns_, struct ccase * c, + int case_idx UNUSED) { - struct dsc_trns *t = (struct dsc_trns *) trns; + struct dsc_trns *t = trns_; struct dsc_z_score *z; + struct variable **vars; + int all_sysmis = 0; + if (t->missing_type == DSC_LISTWISE) + { + assert(t->vars); + for (vars = t->vars; vars < t->vars + t->var_cnt; vars++) + { + double score = case_num (c, (*vars)->fv); + if ( score == SYSMIS + || (!t->include_user_missing + && mv_is_num_user_missing (&(*vars)->miss, score))) + { + all_sysmis = 1; + break; + } + } + } + for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++) { - double score = c->data[z->src_idx].f; - - if (z->mean == SYSMIS || score == SYSMIS) - c->data[z->dst_idx].f = SYSMIS; + double input = case_num (c, z->src_idx); + double *output = &case_data_rw (c, z->dst_idx)->f; + + if (z->mean == SYSMIS || z->std_dev == SYSMIS + || all_sysmis || input == SYSMIS + || (!t->include_user_missing + && mv_is_num_user_missing (&z->v->miss, input))) + *output = SYSMIS; else - c->data[z->dst_idx].f = (score - z->mean) / z->std_dev; + *output = (input - z->mean) / z->std_dev; } return -1; } /* Frees a descriptives_trns struct. */ static void -descriptives_trns_free (struct trns_header * trns) +descriptives_trns_free (void *trns_) { - struct dsc_trns *t = (struct dsc_trns *) trns; + struct dsc_trns *t = trns_; free (t->z_scores); + assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL)); + free (t->vars); } /* Sets up a transformation to calculate Z scores. */ @@ -579,17 +624,29 @@ static void setup_z_trns (struct dsc_proc *dsc) { struct dsc_trns *t; - int cnt, i; + size_t cnt, i; for (cnt = i = 0; i < dsc->var_cnt; i++) if (dsc->vars[i].z_name[0] != '\0') cnt++; t = xmalloc (sizeof *t); - t->h.proc = descriptives_trns_proc; - t->h.free = descriptives_trns_free; - t->z_scores = xmalloc (cnt * sizeof *t->z_scores); + t->z_scores = xnmalloc (cnt, sizeof *t->z_scores); t->z_score_cnt = cnt; + t->missing_type = dsc->missing_type; + t->include_user_missing = dsc->include_user_missing; + if ( t->missing_type == DSC_LISTWISE ) + { + t->var_cnt = dsc->var_cnt; + t->vars = xnmalloc (t->var_cnt, sizeof *t->vars); + for (i = 0; i < t->var_cnt; i++) + t->vars[i] = dsc->vars[i].v; + } + else + { + t->var_cnt = 0; + t->vars = NULL; + } for (cnt = i = 0; i < dsc->var_cnt; i++) { @@ -620,10 +677,11 @@ setup_z_trns (struct dsc_proc *dsc) z->dst_idx = dst_var->fv; z->mean = dv->stats[DSC_MEAN]; z->std_dev = dv->stats[DSC_STDDEV]; + z->v = dv->v; } } - add_transformation ((struct trns_header *) t); + add_transformation (descriptives_trns_proc, descriptives_trns_free, t); } /* Statistical calculation. */ @@ -637,8 +695,8 @@ calc_descriptives (const struct casefile *cf, void *dsc_) { struct dsc_proc *dsc = dsc_; struct casereader *reader; - const struct ccase *c; - int i; + struct ccase c; + size_t i; for (i = 0; i < dsc->var_cnt; i++) { @@ -654,15 +712,16 @@ calc_descriptives (const struct casefile *cf, void *dsc_) dsc->valid = 0.; /* First pass to handle most of the work. */ - reader = casefile_get_reader (cf); - while (casereader_read (reader, &c)) + for (reader = casefile_get_reader (cf); + casereader_read (reader, &c); + case_destroy (&c)) { - double weight = dict_get_case_weight (default_dict, c, &dsc->bad_warn); + double weight = dict_get_case_weight (default_dict, &c, &dsc->bad_warn); if (weight <= 0.0) - continue; + continue; /* Check for missing values. */ - if (listwise_missing (dsc, c)) + if (listwise_missing (dsc, &c)) { dsc->missing_listwise += weight; if (dsc->missing_type == DSC_LISTWISE) @@ -673,19 +732,20 @@ calc_descriptives (const struct casefile *cf, void *dsc_) for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (&c, dv->v->fv); if (dsc->missing_type != DSC_LISTWISE && (x == SYSMIS || (!dsc->include_user_missing - && is_num_user_missing (x, dv->v)))) + && mv_is_num_user_missing (&dv->v->miss, x)))) { dv->missing += weight; continue; } - if (dv->moments != NULL) + if (dv->moments != NULL) moments_pass_one (dv->moments, x, weight); + if (x < dv->min) dv->min = x; if (x > dv->max) @@ -697,28 +757,29 @@ calc_descriptives (const struct casefile *cf, void *dsc_) /* Second pass for higher-order moments. */ if (dsc->max_moment > MOMENT_MEAN) { - reader = casefile_get_reader (cf); - while (casereader_read (reader, &c)) + for (reader = casefile_get_reader (cf); + casereader_read (reader, &c); + case_destroy (&c)) { - double weight = dict_get_case_weight (default_dict, c, + double weight = dict_get_case_weight (default_dict, &c, &dsc->bad_warn); if (weight <= 0.0) continue; /* Check for missing values. */ - if (listwise_missing (dsc, c) + if (listwise_missing (dsc, &c) && dsc->missing_type == DSC_LISTWISE) continue; for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (&c, dv->v->fv); if (dsc->missing_type != DSC_LISTWISE && (x == SYSMIS || (!dsc->include_user_missing - && is_num_user_missing (x, dv->v)))) + && mv_is_num_user_missing (&dv->v->miss, x)))) continue; if (dv->moments != NULL) @@ -773,15 +834,16 @@ calc_descriptives (const struct casefile *cf, void *dsc_) static int listwise_missing (struct dsc_proc *dsc, const struct ccase *c) { - int i; + size_t i; for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; - double x = c->data[dv->v->fv].f; + double x = case_num (c, dv->v->fv); if (x == SYSMIS - || (!dsc->include_user_missing && is_num_user_missing (x, dv->v))) + || (!dsc->include_user_missing + && mv_is_num_user_missing (&dv->v->miss, x))) return 1; } return 0; @@ -795,7 +857,7 @@ static algo_compare_func descriptives_compare_dsc_vars; static void display (struct dsc_proc *dsc) { - int i, j; + size_t i; int nc; struct tab_table *t; @@ -836,6 +898,7 @@ display (struct dsc_proc *dsc) for (i = 0; i < dsc->var_cnt; i++) { struct dsc_var *dv = &dsc->vars[i]; + size_t j; nc = 0; tab_text (t, nc++, i + 1, TAB_LEFT, dv->v->name); @@ -865,7 +928,7 @@ descriptives_compare_dsc_vars (const void *a_, const void *b_, void *dsc_) int result; if (dsc->sort_by_stat == DSC_NAME) - result = strcmp (a->v->name, b->v->name); + result = strcasecmp (a->v->name, b->v->name); else { double as = a->stats[dsc->sort_by_stat];