#include <stdlib.h>
#include "algorithm.h"
#include "alloc.h"
+#include "case.h"
#include "casefile.h"
#include "command.h"
+#include "dictionary.h"
#include "lexer.h"
#include "error.h"
#include "magic.h"
/* DESCRIPTIVES private data. */
+struct dsc_proc;
+
+/* Handling of missing values. */
+enum dsc_missing_type
+ {
+ DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
+ DSC_LISTWISE /* Discard entire case if any variable is missing. */
+ };
+
/* Describes properties of a distribution for the purpose of
calculating a Z-score. */
struct dsc_z_score
int dst_idx; /* Destination index into case data. */
double mean; /* Distribution mean. */
double std_dev; /* Distribution standard deviation. */
+ struct variable *v; /* Variable on which z-score is based. */
};
/* DESCRIPTIVES transformation (for calculating Z-scores). */
struct trns_header h;
struct dsc_z_score *z_scores; /* Array of Z-scores. */
int z_score_cnt; /* Number of Z-scores. */
+ struct variable **vars; /* Variables for listwise missing checks. */
+ size_t var_cnt; /* Number of variables. */
+ enum dsc_missing_type missing_type; /* Treatment of missing values. */
+ int include_user_missing; /* Nonzero to include user-missing values. */
};
/* Statistics. Used as bit indexes, so must be 32 or fewer. */
struct dsc_var
{
struct variable *v; /* Variable to calculate on. */
- char z_name[9]; /* Name for z-score variable. */
+ char z_name[SHORT_NAME_LEN + 1];/* Name for z-score variable. */
double valid, missing; /* Valid, missing counts. */
struct moments *moments; /* Moments. */
double min, max; /* Maximum and mimimum values. */
double stats[DSC_N_STATS]; /* All the stats' values. */
};
-/* Handling of missing values. */
-enum dsc_missing_type
- {
- DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
- DSC_LISTWISE /* Discard entire case if any variable is missing. */
- };
-
/* Output format. */
enum dsc_format
{
/* Accumulated results. */
double missing_listwise; /* Sum of weights of cases missing listwise. */
double valid; /* Sum of weights of valid cases. */
- int bad_weight; /* Nonzero if a bad weight has been found. */
+ int bad_warn; /* Warn if bad weight found. */
enum dsc_statistic sort_by_stat; /* Statistic to sort by; -1: name. */
int sort_ascending; /* !0: ascending order; 0: descending. */
unsigned long show_stats; /* Statistics to display. */
dsc->format = DSC_LINE;
dsc->missing_listwise = 0.;
dsc->valid = 0.;
- dsc->bad_weight = 0;
+ dsc->bad_warn = 1;
dsc->sort_by_stat = DSC_NONE;
dsc->sort_ascending = 1;
dsc->show_stats = dsc->calc_stats = DEFAULT_STATS;
else if (lex_match_id ("DEFAULT"))
dsc->show_stats |= DEFAULT_STATS;
else
- dsc->show_stats |= 1ul << (match_statistic ());
+ dsc->show_stats |= 1ul << (match_statistic ());
lex_match (',');
}
if (dsc->show_stats == 0)
lex_match ('=');
if (lex_match_id ("NAME"))
dsc->sort_by_stat = DSC_NAME;
- else
- dsc->sort_by_stat = match_statistic ();
+ else
+ {
+ dsc->sort_by_stat = match_statistic ();
+ if (dsc->sort_by_stat == DSC_NONE )
+ dsc->sort_by_stat = DSC_MEAN;
+ }
if (lex_match ('('))
{
if (lex_match_id ("A"))
if (!parse_variables (default_dict, &vars, &var_cnt,
PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC))
- break;
+ goto error;
dsc->vars = xrealloc (dsc->vars, sizeof *dsc->vars * var_cnt);
for (i = dsc->var_cnt; i < var_cnt; i++)
if (token != T_ID)
{
lex_error (NULL);
- break;
+ goto error;
}
if (try_name (dsc, tokid))
{
}
else
msg (SE, _("Z-score variable name %s would be"
- "a duplicate variable name."), tokid);
+ " a duplicate variable name."), tokid);
lex_get ();
- lex_force_match (')');
+ if (!lex_force_match (')'))
+ goto error;
}
}
}
else
{
lex_error (NULL);
- break;
+ goto error;
}
lex_match ('/');
/* Data pass. */
multipass_procedure_with_splits (calc_descriptives, dsc);
- if (dsc->bad_weight)
- msg (SW, _("At least one case in the data file had a weight value "
- "that was system-missing, zero, or negative. These case(s) "
- "were ignored."));
/* Z-scoring! */
if (z_cnt)
return CMD_FAILURE;
}
-/* Returns the statistic named by the current token and skips
- past the token. Emits an error if the current token does not
- name a statistic. */
+/* Returns the statistic named by the current token and skips past the token.
+ Returns DSC_NONE if no statistic is given (e.g., subcommand with no
+ specifiers). Emits an error if the current token ID does not name a
+ statistic. */
static enum dsc_statistic
match_statistic (void)
{
for (stat = 0; stat < DSC_N_STATS; stat++)
if (lex_match_id (dsc_info[stat].identifier))
- {
- lex_get ();
- return stat;
- }
+ return stat;
+
+ lex_get();
+ lex_error (_("expecting statistic name: reverting to default"));
}
- lex_error (_("expecting statistic name"));
- return DSC_MEAN;
+ return DSC_NONE;
}
/* Frees DSC. */
/* Try a name based on the original variable name. */
name[0] = 'Z';
strcpy (name + 1, var_name);
- name[8] = '\0';
+ name[SHORT_NAME_LEN] = '\0';
if (try_name (dsc, name))
{
strcpy (z_name, name);
tab_submit (t);
}
-/* Transformation function to calculate Z-scores. */
+/* Transformation function to calculate Z-scores. Will return SYSMIS if any of
+ the following are true: 1) mean or standard deviation is SYSMIS 2) score is
+ SYSMIS 3) score is user missing and they were not included in the original
+ analyis. 4) any of the variables in the original analysis were missing
+ (either system or user-missing values that weren't included).
+*/
static int
descriptives_trns_proc (struct trns_header *trns, struct ccase * c,
- int case_num UNUSED)
+ int case_idx UNUSED)
{
struct dsc_trns *t = (struct dsc_trns *) trns;
struct dsc_z_score *z;
+ struct variable **vars;
+ int all_sysmis = 0;
+ if (t->missing_type == DSC_LISTWISE)
+ {
+ assert(t->vars);
+ for (vars = t->vars; vars < t->vars + t->var_cnt; vars++)
+ {
+ double score = case_num (c, (*vars)->fv);
+ if ( score == SYSMIS || (!t->include_user_missing
+ && is_num_user_missing(score, *vars)) )
+ {
+ all_sysmis = 1;
+ break;
+ }
+ }
+ }
+
for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++)
{
- double score = c->data[z->src_idx].f;
+ double input = case_num (c, z->src_idx);
+ double *output = &case_data_rw (c, z->dst_idx)->f;
- if (z->mean == SYSMIS || score == SYSMIS)
- c->data[z->dst_idx].f = SYSMIS;
+ if (z->mean == SYSMIS || z->std_dev == SYSMIS
+ || all_sysmis || input == SYSMIS
+ || (!t->include_user_missing && is_num_user_missing(input, z->v)))
+ *output = SYSMIS;
else
- c->data[z->dst_idx].f = (score - z->mean) / z->std_dev;
+ *output = (input - z->mean) / z->std_dev;
}
return -1;
}
struct dsc_trns *t = (struct dsc_trns *) trns;
free (t->z_scores);
+ assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL));
+ free (t->vars);
}
/* Sets up a transformation to calculate Z scores. */
t->h.free = descriptives_trns_free;
t->z_scores = xmalloc (cnt * sizeof *t->z_scores);
t->z_score_cnt = cnt;
+ t->missing_type = dsc->missing_type;
+ t->include_user_missing = dsc->include_user_missing;
+ if ( t->missing_type == DSC_LISTWISE )
+ {
+ t->var_cnt = dsc->var_cnt;
+ t->vars = xmalloc(t->var_cnt * sizeof *t->vars);
+ for (i = 0; i < t->var_cnt; i++)
+ t->vars[i] = dsc->vars[i].v;
+ }
+ else
+ {
+ t->var_cnt = 0;
+ t->vars = NULL;
+ }
+
for (cnt = i = 0; i < dsc->var_cnt; i++)
{
z->dst_idx = dst_var->fv;
z->mean = dv->stats[DSC_MEAN];
z->std_dev = dv->stats[DSC_STDDEV];
+ z->v = dv->v;
}
}
{
struct dsc_proc *dsc = dsc_;
struct casereader *reader;
- const struct ccase *c;
+ struct ccase c;
int i;
for (i = 0; i < dsc->var_cnt; i++)
dsc->valid = 0.;
/* First pass to handle most of the work. */
- reader = casefile_get_reader (cf);
- while (casereader_read (reader, &c))
+ for (reader = casefile_get_reader (cf);
+ casereader_read (reader, &c);
+ case_destroy (&c))
{
- double weight = dict_get_case_weight (default_dict, c);
+ double weight = dict_get_case_weight (default_dict, &c, &dsc->bad_warn);
if (weight <= 0.0)
- {
- dsc->bad_weight = 1;
- continue;
- }
-
+ continue;
+
/* Check for missing values. */
- if (listwise_missing (dsc, c))
+ if (listwise_missing (dsc, &c))
{
dsc->missing_listwise += weight;
if (dsc->missing_type == DSC_LISTWISE)
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (&c, dv->v->fv);
if (dsc->missing_type != DSC_LISTWISE
&& (x == SYSMIS
continue;
}
- if (dv->moments != NULL)
+ if (dv->moments != NULL)
moments_pass_one (dv->moments, x, weight);
+
if (x < dv->min)
dv->min = x;
if (x > dv->max)
/* Second pass for higher-order moments. */
if (dsc->max_moment > MOMENT_MEAN)
{
- reader = casefile_get_reader (cf);
- while (casereader_read (reader, &c))
+ for (reader = casefile_get_reader (cf);
+ casereader_read (reader, &c);
+ case_destroy (&c))
{
- double weight = dict_get_case_weight (default_dict, c);
+ double weight = dict_get_case_weight (default_dict, &c,
+ &dsc->bad_warn);
if (weight <= 0.0)
continue;
/* Check for missing values. */
- if (listwise_missing (dsc, c)
+ if (listwise_missing (dsc, &c)
&& dsc->missing_type == DSC_LISTWISE)
continue;
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (&c, dv->v->fv);
if (dsc->missing_type != DSC_LISTWISE
&& (x == SYSMIS
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (c, dv->v->fv);
if (x == SYSMIS
|| (!dsc->include_user_missing && is_num_user_missing (x, dv->v)))