You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA. */
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. */
/* FIXME: Many possible optimizations. */
#include <stdlib.h>
#include "algorithm.h"
#include "alloc.h"
+#include "case.h"
#include "casefile.h"
#include "command.h"
+#include "dictionary.h"
#include "lexer.h"
#include "error.h"
#include "magic.h"
#include "var.h"
#include "vfm.h"
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+#define N_(msgid) msgid
+
/* DESCRIPTIVES private data. */
+struct dsc_proc;
+
+/* Handling of missing values. */
+enum dsc_missing_type
+ {
+ DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
+ DSC_LISTWISE /* Discard entire case if any variable is missing. */
+ };
+
/* Describes properties of a distribution for the purpose of
calculating a Z-score. */
struct dsc_z_score
int dst_idx; /* Destination index into case data. */
double mean; /* Distribution mean. */
double std_dev; /* Distribution standard deviation. */
+ struct variable *v; /* Variable on which z-score is based. */
};
/* DESCRIPTIVES transformation (for calculating Z-scores). */
struct dsc_trns
{
- struct trns_header h;
struct dsc_z_score *z_scores; /* Array of Z-scores. */
int z_score_cnt; /* Number of Z-scores. */
+ struct variable **vars; /* Variables for listwise missing checks. */
+ size_t var_cnt; /* Number of variables. */
+ enum dsc_missing_type missing_type; /* Treatment of missing values. */
+ int include_user_missing; /* Nonzero to include user-missing values. */
};
/* Statistics. Used as bit indexes, so must be 32 or fewer. */
struct dsc_var
{
struct variable *v; /* Variable to calculate on. */
- char z_name[9]; /* Name for z-score variable. */
+ char z_name[LONG_NAME_LEN + 1]; /* Name for z-score variable. */
double valid, missing; /* Valid, missing counts. */
struct moments *moments; /* Moments. */
double min, max; /* Maximum and mimimum values. */
double stats[DSC_N_STATS]; /* All the stats' values. */
};
-/* Handling of missing values. */
-enum dsc_missing_type
- {
- DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
- DSC_LISTWISE /* Discard entire case if any variable is missing. */
- };
-
/* Output format. */
enum dsc_format
{
/* Accumulated results. */
double missing_listwise; /* Sum of weights of cases missing listwise. */
double valid; /* Sum of weights of valid cases. */
- int bad_weight; /* Nonzero if a bad weight has been found. */
+ int bad_warn; /* Warn if bad weight found. */
enum dsc_statistic sort_by_stat; /* Statistic to sort by; -1: name. */
int sort_ascending; /* !0: ascending order; 0: descending. */
unsigned long show_stats; /* Statistics to display. */
/* Z-score functions. */
static int try_name (struct dsc_proc *dsc, char *name);
static int generate_z_varname (struct dsc_proc *dsc, char *z_name,
- const char *name, int *z_cnt);
+ const char *name, size_t *z_cnt);
static void dump_z_table (struct dsc_proc *);
static void setup_z_trns (struct dsc_proc *);
{
struct dsc_proc *dsc;
struct variable **vars = NULL;
- int var_cnt = 0;
+ size_t var_cnt = 0;
int save_z_scores = 0;
- int z_cnt = 0;
- int i;
+ size_t z_cnt = 0;
+ size_t i;
/* Create and initialize dsc. */
dsc = xmalloc (sizeof *dsc);
dsc->format = DSC_LINE;
dsc->missing_listwise = 0.;
dsc->valid = 0.;
- dsc->bad_weight = 0;
+ dsc->bad_warn = 1;
dsc->sort_by_stat = DSC_NONE;
dsc->sort_ascending = 1;
dsc->show_stats = dsc->calc_stats = DEFAULT_STATS;
else if (lex_match_id ("DEFAULT"))
dsc->show_stats |= DEFAULT_STATS;
else
- dsc->show_stats |= 1ul << (match_statistic ());
+ dsc->show_stats |= 1ul << (match_statistic ());
lex_match (',');
}
if (dsc->show_stats == 0)
lex_match ('=');
if (lex_match_id ("NAME"))
dsc->sort_by_stat = DSC_NAME;
- else
- dsc->sort_by_stat = match_statistic ();
+ else
+ {
+ dsc->sort_by_stat = match_statistic ();
+ if (dsc->sort_by_stat == DSC_NONE )
+ dsc->sort_by_stat = DSC_MEAN;
+ }
if (lex_match ('('))
{
if (lex_match_id ("A"))
if (!parse_variables (default_dict, &vars, &var_cnt,
PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC))
- break;
+ goto error;
- dsc->vars = xrealloc (dsc->vars, sizeof *dsc->vars * var_cnt);
+ dsc->vars = xnrealloc (dsc->vars, var_cnt, sizeof *dsc->vars);
for (i = dsc->var_cnt; i < var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
if (token != T_ID)
{
lex_error (NULL);
- break;
+ goto error;
}
if (try_name (dsc, tokid))
{
}
else
msg (SE, _("Z-score variable name %s would be"
- "a duplicate variable name."), tokid);
+ " a duplicate variable name."), tokid);
lex_get ();
- lex_force_match (')');
+ if (!lex_force_match (')'))
+ goto error;
}
}
}
else
{
lex_error (NULL);
- break;
+ goto error;
}
lex_match ('/');
{
if (save_z_scores)
{
- int gen_cnt = 0;
+ size_t gen_cnt = 0;
for (i = 0; i < dsc->var_cnt; i++)
if (dsc->vars[i].z_name[0] == 0)
/* Data pass. */
multipass_procedure_with_splits (calc_descriptives, dsc);
- if (dsc->bad_weight)
- msg (SW, _("At least one case in the data file had a weight value "
- "that was system-missing, zero, or negative. These case(s) "
- "were ignored."));
/* Z-scoring! */
if (z_cnt)
setup_z_trns (dsc);
/* Done. */
+ free (vars);
free_dsc_proc (dsc);
return CMD_SUCCESS;
error:
+ free (vars);
free_dsc_proc (dsc);
return CMD_FAILURE;
}
-/* Returns the statistic named by the current token and skips
- past the token. Emits an error if the current token does not
- name a statistic. */
+/* Returns the statistic named by the current token and skips past the token.
+ Returns DSC_NONE if no statistic is given (e.g., subcommand with no
+ specifiers). Emits an error if the current token ID does not name a
+ statistic. */
static enum dsc_statistic
match_statistic (void)
{
for (stat = 0; stat < DSC_N_STATS; stat++)
if (lex_match_id (dsc_info[stat].identifier))
- {
- lex_get ();
- return stat;
- }
+ return stat;
+
+ lex_get();
+ lex_error (_("expecting statistic name: reverting to default"));
}
- lex_error (_("expecting statistic name"));
- return DSC_MEAN;
+ return DSC_NONE;
}
/* Frees DSC. */
static int
try_name (struct dsc_proc *dsc, char *name)
{
- int i;
+ size_t i;
if (dict_lookup_var (default_dict, name) != NULL)
return 0;
for (i = 0; i < dsc->var_cnt; i++)
- if (!strcmp (dsc->vars[i].z_name, name))
+ if (!strcasecmp (dsc->vars[i].z_name, name))
return 0;
return 1;
}
copies the new name into Z_NAME. On failure, returns zero. */
static int
generate_z_varname (struct dsc_proc *dsc, char *z_name,
- const char *var_name, int *z_cnt)
+ const char *var_name, size_t *z_cnt)
{
- char name[10];
+ char name[LONG_NAME_LEN + 1];
/* Try a name based on the original variable name. */
name[0] = 'Z';
- strcpy (name + 1, var_name);
- name[8] = '\0';
+ str_copy_trunc (name + 1, sizeof name - 1, var_name);
if (try_name (dsc, name))
{
strcpy (z_name, name);
static void
dump_z_table (struct dsc_proc *dsc)
{
- int cnt = 0;
+ size_t cnt = 0;
struct tab_table *t;
{
- int i;
+ size_t i;
for (i = 0; i < dsc->var_cnt; i++)
if (dsc->vars[i].z_name[0] != '\0')
tab_dim (t, tab_natural_dimensions);
{
- int i, y;
+ size_t i, y;
for (i = 0, y = 1; i < dsc->var_cnt; i++)
if (dsc->vars[i].z_name[0] != '\0')
tab_submit (t);
}
-/* Transformation function to calculate Z-scores. */
+/* Transformation function to calculate Z-scores. Will return SYSMIS if any of
+ the following are true: 1) mean or standard deviation is SYSMIS 2) score is
+ SYSMIS 3) score is user missing and they were not included in the original
+ analyis. 4) any of the variables in the original analysis were missing
+ (either system or user-missing values that weren't included).
+*/
static int
-descriptives_trns_proc (struct trns_header *trns, struct ccase * c,
- int case_num UNUSED)
+descriptives_trns_proc (void *trns_, struct ccase * c,
+ int case_idx UNUSED)
{
- struct dsc_trns *t = (struct dsc_trns *) trns;
+ struct dsc_trns *t = trns_;
struct dsc_z_score *z;
+ struct variable **vars;
+ int all_sysmis = 0;
+ if (t->missing_type == DSC_LISTWISE)
+ {
+ assert(t->vars);
+ for (vars = t->vars; vars < t->vars + t->var_cnt; vars++)
+ {
+ double score = case_num (c, (*vars)->fv);
+ if ( score == SYSMIS
+ || (!t->include_user_missing
+ && mv_is_num_user_missing (&(*vars)->miss, score)))
+ {
+ all_sysmis = 1;
+ break;
+ }
+ }
+ }
+
for (z = t->z_scores; z < t->z_scores + t->z_score_cnt; z++)
{
- double score = c->data[z->src_idx].f;
-
- if (z->mean == SYSMIS || score == SYSMIS)
- c->data[z->dst_idx].f = SYSMIS;
+ double input = case_num (c, z->src_idx);
+ double *output = &case_data_rw (c, z->dst_idx)->f;
+
+ if (z->mean == SYSMIS || z->std_dev == SYSMIS
+ || all_sysmis || input == SYSMIS
+ || (!t->include_user_missing
+ && mv_is_num_user_missing (&z->v->miss, input)))
+ *output = SYSMIS;
else
- c->data[z->dst_idx].f = (score - z->mean) / z->std_dev;
+ *output = (input - z->mean) / z->std_dev;
}
return -1;
}
/* Frees a descriptives_trns struct. */
static void
-descriptives_trns_free (struct trns_header * trns)
+descriptives_trns_free (void *trns_)
{
- struct dsc_trns *t = (struct dsc_trns *) trns;
+ struct dsc_trns *t = trns_;
free (t->z_scores);
+ assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL));
+ free (t->vars);
}
/* Sets up a transformation to calculate Z scores. */
setup_z_trns (struct dsc_proc *dsc)
{
struct dsc_trns *t;
- int cnt, i;
+ size_t cnt, i;
for (cnt = i = 0; i < dsc->var_cnt; i++)
if (dsc->vars[i].z_name[0] != '\0')
cnt++;
t = xmalloc (sizeof *t);
- t->h.proc = descriptives_trns_proc;
- t->h.free = descriptives_trns_free;
- t->z_scores = xmalloc (cnt * sizeof *t->z_scores);
+ t->z_scores = xnmalloc (cnt, sizeof *t->z_scores);
t->z_score_cnt = cnt;
+ t->missing_type = dsc->missing_type;
+ t->include_user_missing = dsc->include_user_missing;
+ if ( t->missing_type == DSC_LISTWISE )
+ {
+ t->var_cnt = dsc->var_cnt;
+ t->vars = xnmalloc (t->var_cnt, sizeof *t->vars);
+ for (i = 0; i < t->var_cnt; i++)
+ t->vars[i] = dsc->vars[i].v;
+ }
+ else
+ {
+ t->var_cnt = 0;
+ t->vars = NULL;
+ }
for (cnt = i = 0; i < dsc->var_cnt; i++)
{
z->dst_idx = dst_var->fv;
z->mean = dv->stats[DSC_MEAN];
z->std_dev = dv->stats[DSC_STDDEV];
+ z->v = dv->v;
}
}
- add_transformation ((struct trns_header *) t);
+ add_transformation (descriptives_trns_proc, descriptives_trns_free, t);
}
\f
/* Statistical calculation. */
{
struct dsc_proc *dsc = dsc_;
struct casereader *reader;
- const struct ccase *c;
- int i;
+ struct ccase c;
+ size_t i;
for (i = 0; i < dsc->var_cnt; i++)
{
dsc->valid = 0.;
/* First pass to handle most of the work. */
- reader = casefile_get_reader (cf);
- while (casereader_read (reader, &c))
+ for (reader = casefile_get_reader (cf);
+ casereader_read (reader, &c);
+ case_destroy (&c))
{
- double weight = dict_get_case_weight (default_dict, c);
+ double weight = dict_get_case_weight (default_dict, &c, &dsc->bad_warn);
if (weight <= 0.0)
- {
- dsc->bad_weight = 1;
- continue;
- }
-
+ continue;
+
/* Check for missing values. */
- if (listwise_missing (dsc, c))
+ if (listwise_missing (dsc, &c))
{
dsc->missing_listwise += weight;
if (dsc->missing_type == DSC_LISTWISE)
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (&c, dv->v->fv);
if (dsc->missing_type != DSC_LISTWISE
&& (x == SYSMIS
|| (!dsc->include_user_missing
- && is_num_user_missing (x, dv->v))))
+ && mv_is_num_user_missing (&dv->v->miss, x))))
{
dv->missing += weight;
continue;
}
- if (dv->moments != NULL)
+ if (dv->moments != NULL)
moments_pass_one (dv->moments, x, weight);
+
if (x < dv->min)
dv->min = x;
if (x > dv->max)
/* Second pass for higher-order moments. */
if (dsc->max_moment > MOMENT_MEAN)
{
- reader = casefile_get_reader (cf);
- while (casereader_read (reader, &c))
+ for (reader = casefile_get_reader (cf);
+ casereader_read (reader, &c);
+ case_destroy (&c))
{
- double weight = dict_get_case_weight (default_dict, c);
+ double weight = dict_get_case_weight (default_dict, &c,
+ &dsc->bad_warn);
if (weight <= 0.0)
continue;
/* Check for missing values. */
- if (listwise_missing (dsc, c)
+ if (listwise_missing (dsc, &c)
&& dsc->missing_type == DSC_LISTWISE)
continue;
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (&c, dv->v->fv);
if (dsc->missing_type != DSC_LISTWISE
&& (x == SYSMIS
|| (!dsc->include_user_missing
- && is_num_user_missing (x, dv->v))))
+ && mv_is_num_user_missing (&dv->v->miss, x))))
continue;
if (dv->moments != NULL)
static int
listwise_missing (struct dsc_proc *dsc, const struct ccase *c)
{
- int i;
+ size_t i;
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
- double x = c->data[dv->v->fv].f;
+ double x = case_num (c, dv->v->fv);
if (x == SYSMIS
- || (!dsc->include_user_missing && is_num_user_missing (x, dv->v)))
+ || (!dsc->include_user_missing
+ && mv_is_num_user_missing (&dv->v->miss, x)))
return 1;
}
return 0;
static void
display (struct dsc_proc *dsc)
{
- int i, j;
+ size_t i;
int nc;
struct tab_table *t;
for (i = 0; i < dsc->var_cnt; i++)
{
struct dsc_var *dv = &dsc->vars[i];
+ size_t j;
nc = 0;
tab_text (t, nc++, i + 1, TAB_LEFT, dv->v->name);
int result;
if (dsc->sort_by_stat == DSC_NAME)
- result = strcmp (a->v->name, b->v->name);
+ result = strcasecmp (a->v->name, b->v->name);
else
{
double as = a->stats[dsc->sort_by_stat];