From d4e19e545c74781f3d36b670f9a27d93a76a3771 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Mon, 23 Jan 2012 19:47:01 +0100 Subject: [PATCH] Moved rank.q to rank.c --- src/language/stats/.gitignore | 2 - src/language/stats/automake.mk | 2 +- src/language/stats/{rank.q => rank.c} | 1011 +++++++++++++------------ 3 files changed, 522 insertions(+), 493 deletions(-) rename src/language/stats/{rank.q => rank.c} (50%) diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index 8b9a1e75..79c1832a 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -1,6 +1,4 @@ crosstabs.c examine.c frequencies.c -rank.c regression.c -t-test.c diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 843653b3..1ca22afc 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -6,7 +6,6 @@ src_language_stats_built_sources = \ src/language/stats/crosstabs.c \ src/language/stats/examine.c \ src/language/stats/frequencies.c \ - src/language/stats/rank.c \ src/language/stats/regression.c language_stats_sources = \ @@ -45,6 +44,7 @@ language_stats_sources = \ src/language/stats/npar-summary.h \ src/language/stats/oneway.c \ src/language/stats/quick-cluster.c \ + src/language/stats/rank.c \ src/language/stats/reliability.c \ src/language/stats/roc.c \ src/language/stats/roc.h \ diff --git a/src/language/stats/rank.q b/src/language/stats/rank.c similarity index 50% rename from src/language/stats/rank.q rename to src/language/stats/rank.c index 2247354d..cd932ac8 100644 --- a/src/language/stats/rank.q +++ b/src/language/stats/rank.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc. 
+ Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,78 +16,69 @@ #include -#include -#include -#include - #include "data/case.h" #include "data/casegrouper.h" #include "data/casereader.h" -#include "data/casewriter.h" #include "data/dataset.h" #include "data/dictionary.h" #include "data/format.h" -#include "data/missing-values.h" -#include "data/short-names.h" -#include "data/subcase.h" #include "data/variable.h" +#include "data/subcase.h" +#include "data/casewriter.h" +#include "data/short-names.h" + #include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" #include "language/stats/sort-criteria.h" -#include "libpspp/compiler.h" -#include "libpspp/taint.h" + #include "math/sort.h" + +#include "libpspp/assertion.h" +#include "libpspp/misc.h" +#include "libpspp/taint.h" +#include "libpspp/pool.h" +#include "libpspp/message.h" + + #include "output/tab.h" +#include + +#include + #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) -/* (headers) */ - -/* (specification) - "RANK" (rank_): - *^variables=custom; - +rank=custom; - +normal=custom; - +percent=custom; - +ntiles=custom; - +rfraction=custom; - +proportion=custom; - +n=custom; - +savage=custom; - +print=print:!yes/no; - +fraction=fraction:!blom/tukey/vw/rankit; - +ties=ties:!mean/low/high/condense; - missing=miss:!exclude/include. 
-*/ -/* (declarations) */ -/* (functions) */ +struct rank; -typedef double (*rank_function_t) (double c, double cc, double cc_1, - int i, double w); +typedef double (*rank_function_t) (const struct rank*, double c, double cc, double cc_1, + int i, double w); -static double rank_proportion (double c, double cc, double cc_1, +static double rank_proportion (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_normal (double c, double cc, double cc_1, +static double rank_normal (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_percent (double c, double cc, double cc_1, +static double rank_percent (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_rfraction (double c, double cc, double cc_1, +static double rank_rfraction (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_rank (double c, double cc, double cc_1, +static double rank_rank (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_n (double c, double cc, double cc_1, +static double rank_n (const struct rank *, double c, double cc, double cc_1, int i, double w); -static double rank_savage (double c, double cc, double cc_1, - int i, double w); +static double rank_savage (const struct rank *, double c, double cc, double cc_1, + int i, double w); -static double rank_ntiles (double c, double cc, double cc_1, - int i, double w); +static double rank_ntiles (const struct rank *, double c, double cc, double cc_1, + int i, double w); enum RANK_FUNC @@ -134,8 +125,24 @@ static const rank_function_t rank_func[n_RANK_FUNCS] = { rank_n, rank_ntiles, rank_savage +}; + + +enum ties + { + TIES_LOW, + TIES_HIGH, + TIES_MEAN, + TIES_CONDENSE }; +enum fraction + { + FRAC_BLOM, + FRAC_RANKIT, + FRAC_TUKEY, + FRAC_VW + }; struct rank_spec { @@ -144,179 +151,226 @@ struct rank_spec }; -/* Categories of missing values to exclude. 
*/ -static enum mv_class exclude_values; - -static struct rank_spec *rank_specs; -static size_t n_rank_specs; +/* Create and return a new variable in which to store the ranks of SRC_VAR + according to the rank function F. + VNAME is the name of the variable to be created. + If VNAME is NULL, then a name will be automatically chosen. +*/ +static struct variable * +create_rank_variable (struct dictionary *dict, enum RANK_FUNC f, + const struct variable *src_var, + const char *vname) +{ + int i; + struct variable *var = NULL; + char name[SHORT_NAME_LEN + 1]; -static struct subcase sc; + if ( vname ) + var = dict_create_var(dict, vname, 0); -static const struct variable **group_vars; -static size_t n_group_vars; + if ( NULL == var ) + { + snprintf (name, SHORT_NAME_LEN + 1, "%c%s", + function_name[f][0], var_get_name (src_var)); -static const struct variable **src_vars; -static size_t n_src_vars; + var = dict_create_var(dict, name, 0); + } + i = 1; + while( NULL == var ) + { + char func_abb[4]; + snprintf(func_abb, 4, "%s", function_name[f]); + snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb, + i); + var = dict_create_var(dict, name, 0); + if (i++ >= 999) + break; + } -static int k_ntiles; + i = 1; + while ( NULL == var ) + { + char func_abb[3]; + snprintf(func_abb, 3, "%s", function_name[f]); -static struct cmd_rank cmd; + snprintf(name, SHORT_NAME_LEN + 1, + "RNK%s%02d", func_abb, i); -static void rank_sorted_file (struct casereader *, - struct casewriter *, - const struct dictionary *, - const struct rank_spec *rs, - int n_rank_specs, - int idx, - const struct variable *rank_var); + var = dict_create_var(dict, name, 0); + if ( i++ >= 99 ) + break; + } -static const char * -fraction_name(void) -{ - static char name[10]; - switch ( cmd.fraction ) + if ( NULL == var ) { - case RANK_BLOM: - strcpy (name, "BLOM"); - break; - case RANK_RANKIT: - strcpy (name, "RANKIT"); - break; - case RANK_TUKEY: - strcpy (name, "TUKEY"); - break; - case RANK_VW: - strcpy (name, 
"VW"); - break; - default: - NOT_REACHED (); + msg(ME, _("Cannot create new rank variable. All candidates in use.")); + return NULL; } - return name; + + var_set_both_formats (var, &dest_format[f]); + + return var; } -/* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */ -static void -create_var_label (struct variable *dest_var, - const struct variable *src_var, enum RANK_FUNC f) +struct rank { - struct string label; - ds_init_empty (&label); + struct dictionary *dict; - if ( n_group_vars > 0 ) - { - struct string group_var_str; - int g; + struct subcase sc; - ds_init_empty (&group_var_str); + const struct variable **vars; + size_t n_vars; - for (g = 0 ; g < n_group_vars ; ++g ) - { - if ( g > 0 ) ds_put_cstr (&group_var_str, " "); - ds_put_cstr (&group_var_str, var_get_name (group_vars[g])); - } + bool ascending; - ds_put_format (&label, _("%s of %s by %s"), function_name[f], - var_get_name (src_var), ds_cstr (&group_var_str)); - ds_destroy (&group_var_str); - } - else - ds_put_format (&label, _("%s of %s"), - function_name[f], var_get_name (src_var)); + const struct variable **group_vars; + size_t n_group_vars; - var_set_label (dest_var, ds_cstr (&label), false); - ds_destroy (&label); -} + enum mv_class exclude; + struct rank_spec *rs; + size_t n_rs; -static bool -rank_cmd (struct dataset *ds, const struct subcase *sc, - const struct rank_spec *rank_specs, int n_rank_specs) -{ - struct dictionary *d = dataset_dict (ds); - bool ok = true; - int i; + enum ties ties; - for (i = 0 ; i < subcase_get_n_fields (sc) ; ++i ) - { - /* Rank variable at index I in SC. 
*/ - struct casegrouper *split_grouper; - struct casereader *split_group; - struct casewriter *output; + enum fraction fraction; + int k_ntiles; - proc_discard_output (ds); - split_grouper = casegrouper_create_splits (proc_open (ds), d); - output = autopaging_writer_create (dict_get_proto (d)); + bool print; - while (casegrouper_get_next_group (split_grouper, &split_group)) - { - struct subcase ordering; - struct casereader *ordered; - struct casegrouper *by_grouper; - struct casereader *by_group; + /* Pool on which cell functions may allocate data */ + struct pool *pool; +}; - /* Sort this split group by the BY variables as primary - keys and the rank variable as secondary key. */ - subcase_init_vars (&ordering, group_vars, n_group_vars); - subcase_add_var (&ordering, src_vars[i], - subcase_get_direction (sc, i)); - ordered = sort_execute (split_group, &ordering); - subcase_destroy (&ordering); - /* Rank the rank variable within this split group. */ - by_grouper = casegrouper_create_vars (ordered, - group_vars, n_group_vars); - while (casegrouper_get_next_group (by_grouper, &by_group)) - { - /* Rank the rank variable within this BY group - within the split group. 
*/ +static void +destroy_rank (struct rank *rank) +{ + free (rank->vars); + free (rank->group_vars); + subcase_destroy (&rank->sc); + pool_destroy (rank->pool); +} - rank_sorted_file (by_group, output, d, rank_specs, n_rank_specs, - i, src_vars[i]); - } - ok = casegrouper_destroy (by_grouper) && ok; - } - ok = casegrouper_destroy (split_grouper); - ok = proc_commit (ds) && ok; - ok = (dataset_set_source (ds, casewriter_make_reader (output)) - && ok); - if (!ok) - break; +static bool +parse_into (struct lexer *lexer, struct rank *cmd) +{ + int var_count = 0; + struct rank_spec *rs = NULL; + + cmd->rs = pool_realloc (cmd->pool, cmd->rs, sizeof (*cmd->rs) * (cmd->n_rs + 1)); + rs = &cmd->rs[cmd->n_rs]; + + if (lex_match_id (lexer, "RANK")) + { + rs->rfunc = RANK; + } + else if (lex_match_id (lexer, "NORMAL")) + { + rs->rfunc = NORMAL; + } + else if (lex_match_id (lexer, "RFRACTION")) + { + rs->rfunc = RFRACTION; + } + else if (lex_match_id (lexer, "N")) + { + rs->rfunc = N; + } + else if (lex_match_id (lexer, "SAVAGE")) + { + rs->rfunc = SAVAGE; + } + else if (lex_match_id (lexer, "PERCENT")) + { + rs->rfunc = PERCENT; + } + else if (lex_match_id (lexer, "PROPORTION")) + { + rs->rfunc = PROPORTION; + } + else if (lex_match_id (lexer, "NTILES")) + { + if ( !lex_force_match (lexer, T_LPAREN)) + return false; + + if (! 
lex_force_int (lexer) ) + return false; + + cmd->k_ntiles = lex_integer (lexer); + lex_get (lexer); + + if ( !lex_force_match (lexer, T_RPAREN)) + return false; + + rs->rfunc = NTILES; + } + else + { + return false; } - return ok; + cmd->n_rs++; + rs->destvars = NULL; + rs->destvars = pool_calloc (cmd->pool, cmd->n_vars, sizeof (*rs->destvars)); + + if (lex_match_id (lexer, "INTO")) + { + while( lex_token (lexer) == T_ID ) + { + const char *name = lex_tokcstr (lexer); + if ( dict_lookup_var (cmd->dict, name) != NULL ) + { + msg (SE, _("Variable %s already exists."), name); + return false; + } + + if ( var_count >= subcase_get_n_fields (&cmd->sc) ) + { + msg (SE, _("Too many variables in INTO clause.")); + return false; + } + rs->destvars[var_count] = + create_rank_variable (cmd->dict, rs->rfunc, cmd->vars[var_count], name); + ++var_count; + lex_get (lexer); + } + } + + return true; } /* Hardly a rank function !! */ static double -rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED, - int i UNUSED, double w) +rank_n (const struct rank *cmd UNUSED, double c UNUSED, double cc UNUSED, double cc_1 UNUSED, + int i UNUSED, double w) { return w; } static double -rank_rank (double c, double cc, double cc_1, - int i, double w UNUSED) +rank_rank (const struct rank *cmd, double c, double cc, double cc_1, + int i, double w UNUSED) { double rank; if ( c >= 1.0 ) { - switch (cmd.ties) + switch (cmd->ties) { - case RANK_LOW: + case TIES_LOW: rank = cc_1 + 1; break; - case RANK_HIGH: + case TIES_HIGH: rank = cc; break; - case RANK_MEAN: + case TIES_MEAN: rank = cc_1 + (c + 1.0)/ 2.0; break; - case RANK_CONDENSE: + case TIES_CONDENSE: rank = i; break; default: @@ -325,18 +379,18 @@ rank_rank (double c, double cc, double cc_1, } else { - switch (cmd.ties) + switch (cmd->ties) { - case RANK_LOW: + case TIES_LOW: rank = cc_1; break; - case RANK_HIGH: + case TIES_HIGH: rank = cc; break; - case RANK_MEAN: + case TIES_MEAN: rank = cc_1 + c / 2.0 ; break; - case RANK_CONDENSE: + 
case TIES_CONDENSE: rank = i; break; default: @@ -349,41 +403,41 @@ rank_rank (double c, double cc, double cc_1, static double -rank_rfraction (double c, double cc, double cc_1, +rank_rfraction (const struct rank *cmd, double c, double cc, double cc_1, int i, double w) { - return rank_rank (c, cc, cc_1, i, w) / w ; + return rank_rank (cmd, c, cc, cc_1, i, w) / w ; } static double -rank_percent (double c, double cc, double cc_1, - int i, double w) +rank_percent (const struct rank *cmd, double c, double cc, double cc_1, + int i, double w) { - return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ; + return rank_rank (cmd, c, cc, cc_1, i, w) * 100.0 / w ; } static double -rank_proportion (double c, double cc, double cc_1, +rank_proportion (const struct rank *cmd, double c, double cc, double cc_1, int i, double w) { - const double r = rank_rank (c, cc, cc_1, i, w) ; + const double r = rank_rank (cmd, c, cc, cc_1, i, w) ; double f; - switch ( cmd.fraction ) + switch ( cmd->fraction ) { - case RANK_BLOM: + case FRAC_BLOM: f = (r - 3.0/8.0) / (w + 0.25); break; - case RANK_RANKIT: + case FRAC_RANKIT: f = (r - 0.5) / w ; break; - case RANK_TUKEY: + case FRAC_TUKEY: f = (r - 1.0/3.0) / (w + 1.0/3.0); break; - case RANK_VW: + case FRAC_VW: f = r / ( w + 1.0); break; default: @@ -395,22 +449,22 @@ rank_proportion (double c, double cc, double cc_1, } static double -rank_normal (double c, double cc, double cc_1, +rank_normal (const struct rank *cmd, double c, double cc, double cc_1, int i, double w) { - double f = rank_proportion (c, cc, cc_1, i, w); + double f = rank_proportion (cmd, c, cc, cc_1, i, w); return gsl_cdf_ugaussian_Pinv (f); } static double -rank_ntiles (double c, double cc, double cc_1, - int i, double w) +rank_ntiles (const struct rank *cmd, double c, double cc, double cc_1, + int i, double w) { - double r = rank_rank (c, cc, cc_1, i, w); + double r = rank_rank (cmd, c, cc, cc_1, i, w); - return ( floor (( r * k_ntiles) / ( w + 1) ) + 1); + return ( floor (( r * 
cmd->k_ntiles) / ( w + 1) ) + 1); } /* Expected value of the order statistics from an exponential distribution */ @@ -428,8 +482,8 @@ ee (int j, double w_star) static double -rank_savage (double c, double cc, double cc_1, - int i UNUSED, double w) +rank_savage (const struct rank *cmd UNUSED, double c, double cc, double cc_1, + int i UNUSED, double w) { double int_part; const int i_1 = floor (cc_1); @@ -463,14 +517,14 @@ rank_savage (double c, double cc, double cc_1, NOT_REACHED(); } + static void rank_sorted_file (struct casereader *input, struct casewriter *output, const struct dictionary *dict, - const struct rank_spec *rs, - int n_rank_specs, int dest_idx, - const struct variable *rank_var) + const struct rank *cmd + ) { struct casereader *pass1, *pass2, *pass2_1; struct casegrouper *tie_grouper; @@ -479,9 +533,8 @@ rank_sorted_file (struct casereader *input, double cc = 0.0; int tie_group = 1; - - input = casereader_create_filter_missing (input, &rank_var, 1, - exclude_values, NULL, output); + input = casereader_create_filter_missing (input, &cmd->vars[dest_idx], 1, + cmd->exclude, NULL, output); input = casereader_create_filter_weight (input, dict, NULL, output); casereader_split (input, &pass1, &pass2); @@ -492,7 +545,7 @@ rank_sorted_file (struct casereader *input, casereader_destroy (pass1); /* Pass 2: Do ranking. 
*/ - tie_grouper = casegrouper_create_vars (pass2, &rank_var, 1); + tie_grouper = casegrouper_create_vars (pass2, &cmd->vars[dest_idx], 1); while (casegrouper_get_next_group (tie_grouper, &pass2_1)) { struct casereader *pass2_2; @@ -514,11 +567,11 @@ rank_sorted_file (struct casereader *input, while ((c = casereader_read (pass2_2)) != NULL) { c = case_unshare (c); - for (i = 0; i < n_rank_specs; ++i) + for (i = 0; i < cmd->n_rs; ++i) { - const struct variable *dst_var = rs[i].destvars[dest_idx]; + const struct variable *dst_var = cmd->rs[i].destvars[dest_idx]; double *dst_value = &case_data_rw (c, dst_var)->f; - *dst_value = rank_func[rs[i].rfunc] (tw, cc, cc_1, tie_group, w); + *dst_value = rank_func[cmd->rs[i].rfunc] (cmd, tw, cc, cc_1, tie_group, w); } casewriter_write (output, c); } @@ -529,6 +582,7 @@ rank_sorted_file (struct casereader *input, casegrouper_destroy (tie_grouper); } + /* Transformation function to enumerate all the cases */ static int create_resort_key (void *key_var_, struct ccase **cc, casenumber case_num) @@ -541,226 +595,303 @@ create_resort_key (void *key_var_, struct ccase **cc, casenumber case_num) return TRNS_CONTINUE; } +static bool +rank_cmd (struct dataset *ds, const struct rank *cmd); -/* Create and return a new variable in which to store the ranks of SRC_VAR - accoring to the rank function F. - VNAME is the name of the variable to be created. - If VNAME is NULL, then a name will be automatically chosen. 
- */ -static struct variable * -create_rank_variable (struct dictionary *dict, enum RANK_FUNC f, - const struct variable *src_var, - const char *vname) -{ - int i; - struct variable *var = NULL; - char name[SHORT_NAME_LEN + 1]; - if ( vname ) - var = dict_create_var(dict, vname, 0); - - if ( NULL == var ) +static const char * +fraction_name (const struct rank *cmd) +{ + static char name[10]; + switch (cmd->fraction ) { - snprintf (name, SHORT_NAME_LEN + 1, "%c%s", - function_name[f][0], var_get_name (src_var)); - - var = dict_create_var(dict, name, 0); + case FRAC_BLOM: + strcpy (name, "BLOM"); + break; + case FRAC_RANKIT: + strcpy (name, "RANKIT"); + break; + case FRAC_TUKEY: + strcpy (name, "TUKEY"); + break; + case FRAC_VW: + strcpy (name, "VW"); + break; + default: + NOT_REACHED (); } + return name; +} - i = 1; - while( NULL == var ) - { - char func_abb[4]; - snprintf(func_abb, 4, "%s", function_name[f]); - snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb, - i); - - var = dict_create_var(dict, name, 0); - if (i++ >= 999) - break; - } +/* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */ +static void +create_var_label (struct rank *cmd, struct variable *dest_var, + const struct variable *src_var, enum RANK_FUNC f) +{ + struct string label; + ds_init_empty (&label); - i = 1; - while ( NULL == var ) + if ( cmd->n_group_vars > 0 ) { - char func_abb[3]; - snprintf(func_abb, 3, "%s", function_name[f]); + struct string group_var_str; + int g; - snprintf(name, SHORT_NAME_LEN + 1, - "RNK%s%02d", func_abb, i); + ds_init_empty (&group_var_str); - var = dict_create_var(dict, name, 0); - if ( i++ >= 99 ) - break; - } + for (g = 0 ; g < cmd->n_group_vars ; ++g ) + { + if ( g > 0 ) ds_put_cstr (&group_var_str, " "); + ds_put_cstr (&group_var_str, var_get_name (cmd->group_vars[g])); + } - if ( NULL == var ) - { - msg(ME, _("Cannot create new rank variable. 
All candidates in use.")); - return NULL; + ds_put_format (&label, _("%s of %s by %s"), function_name[f], + var_get_name (src_var), ds_cstr (&group_var_str)); + ds_destroy (&group_var_str); } + else + ds_put_format (&label, _("%s of %s"), + function_name[f], var_get_name (src_var)); - var_set_both_formats (var, &dest_format[f]); - - return var; + var_set_label (dest_var, ds_cstr (&label), false); + + ds_destroy (&label); } - -static void -rank_cleanup(void) +int +cmd_rank (struct lexer *lexer, struct dataset *ds) { - int i; + struct rank rank; + struct variable *order; + bool result = true; - free (group_vars); - group_vars = NULL; - n_group_vars = 0; + subcase_init_empty (&rank.sc); - for (i = 0 ; i < n_rank_specs ; ++i ) - free (rank_specs[i].destvars); + rank.rs = NULL; + rank.n_rs = 0; + rank.exclude = MV_ANY; + rank.n_group_vars = 0; + rank.group_vars = NULL; + rank.dict = dataset_dict (ds); + rank.ties = TIES_MEAN; + rank.fraction = FRAC_BLOM; + rank.print = true; + rank.pool = pool_create (); - free (rank_specs); - rank_specs = NULL; - n_rank_specs = 0; + if (lex_match_id (lexer, "VARIABLES")) + lex_force_match (lexer, T_EQUALS); - subcase_destroy (&sc); + if (!parse_sort_criteria (lexer, rank.dict, + &rank.sc, + &rank.vars, + &rank.ascending)) + goto error; - free (src_vars); - src_vars = NULL; - n_src_vars = 0; -} + rank.n_vars = rank.sc.n_fields; -int -cmd_rank (struct lexer *lexer, struct dataset *ds) -{ - bool result; - struct variable *order; - size_t i; - n_rank_specs = 0; - - subcase_init_empty (&sc); - if ( !parse_rank (lexer, ds, &cmd, NULL) ) + if (lex_match (lexer, T_BY) ) { - rank_cleanup (); - return CMD_FAILURE; + if ( ! parse_variables_const (lexer, rank.dict, + &rank.group_vars, &rank.n_group_vars, + PV_NO_DUPLICATE | PV_NO_SCRATCH)) + goto error; } - /* If /MISSING=INCLUDE is set, then user missing values are ignored */ - exclude_values = cmd.miss == RANK_INCLUDE ? 
MV_SYSTEM : MV_ANY; - /* Default to /RANK if no function subcommands are given */ - if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion || - cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n || - cmd.sbc_percent || cmd.sbc_rank ) ) + while (lex_token (lexer) != T_ENDCMD ) { - assert ( n_rank_specs == 0 ); + lex_force_match (lexer, T_SLASH); + if (lex_match_id (lexer, "TIES")) + { + lex_force_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "MEAN")) + { + rank.ties = TIES_MEAN; + } + else if (lex_match_id (lexer, "LOW")) + { + rank.ties = TIES_LOW; + } + else if (lex_match_id (lexer, "HIGH")) + { + rank.ties = TIES_HIGH; + } + else if (lex_match_id (lexer, "CONDENSE")) + { + rank.ties = TIES_CONDENSE; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + else if (lex_match_id (lexer, "FRACTION")) + { + lex_force_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "BLOM")) + { + rank.fraction = FRAC_BLOM; + } + else if (lex_match_id (lexer, "TUKEY")) + { + rank.fraction = FRAC_TUKEY; + } + else if (lex_match_id (lexer, "VW")) + { + rank.fraction = FRAC_VW; + } + else if (lex_match_id (lexer, "RANKIT")) + { + rank.fraction = FRAC_RANKIT; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + else if (lex_match_id (lexer, "PRINT")) + { + lex_force_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "YES")) + { + rank.print = true; + } + else if (lex_match_id (lexer, "NO")) + { + rank.print = false; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + else if (lex_match_id (lexer, "MISSING")) + { + lex_force_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "INCLUDE")) + { + rank.exclude = MV_SYSTEM; + } + else if (lex_match_id (lexer, "EXCLUDE")) + { + rank.exclude = MV_ANY; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + else if (! 
parse_into (lexer, &rank)) + goto error; + } - rank_specs = xmalloc (sizeof (*rank_specs)); - rank_specs[0].rfunc = RANK; - rank_specs[0].destvars = - xcalloc (subcase_get_n_fields (&sc), sizeof (struct variable *)); - n_rank_specs = 1; + /* If no rank specs are given, then apply a default */ + if ( rank.n_rs == 0) + { + rank.rs = pool_calloc (rank.pool, 1, sizeof (*rank.rs)); + rank.n_rs = 1; + rank.rs[0].rfunc = RANK; + rank.rs[0].destvars = pool_calloc (rank.pool, rank.n_vars, sizeof (*rank.rs[0].destvars)); } - assert ( subcase_get_n_fields (&sc) == n_src_vars); - /* Create variables for all rank destinations which haven't already been created with INTO. Add labels to all the destination variables. */ - for (i = 0 ; i < n_rank_specs ; ++i ) + int i; + for (i = 0 ; i < rank.n_rs ; ++i ) { int v; - for ( v = 0 ; v < n_src_vars ; v ++ ) - { - struct dictionary *dict = dataset_dict (ds); + struct rank_spec *rs = &rank.rs[i]; - if ( rank_specs[i].destvars[v] == NULL ) + for ( v = 0 ; v < rank.n_vars ; v ++ ) + { + if ( rs->destvars[v] == NULL ) { - rank_specs[i].destvars[v] = - create_rank_variable (dict, rank_specs[i].rfunc, src_vars[v], NULL); + rs->destvars[v] = + create_rank_variable (rank.dict, rs->rfunc, rank.vars[v], NULL); } - create_var_label ( rank_specs[i].destvars[v], - src_vars[v], - rank_specs[i].rfunc); + create_var_label (&rank, rs->destvars[v], + rank.vars[v], + rs->rfunc); } } - if ( cmd.print == RANK_YES ) + if ( rank.print ) { int v; tab_output_text (0, _("Variables Created By RANK")); tab_output_text (0, ""); - for (i = 0 ; i < n_rank_specs ; ++i ) + for (i = 0 ; i < rank.n_rs ; ++i ) { - for ( v = 0 ; v < n_src_vars ; v ++ ) + for ( v = 0 ; v < rank.n_vars ; v ++ ) { - if ( n_group_vars > 0 ) + if ( rank.n_group_vars > 0 ) { struct string varlist; int g; ds_init_empty (&varlist); - for ( g = 0 ; g < n_group_vars ; ++g ) + for ( g = 0 ; g < rank.n_group_vars ; ++g ) { - ds_put_cstr (&varlist, var_get_name (group_vars[g])); + ds_put_cstr 
(&varlist, var_get_name (rank.group_vars[g])); - if ( g < n_group_vars - 1) + if ( g < rank.n_group_vars - 1) ds_put_cstr (&varlist, " "); } - if ( rank_specs[i].rfunc == NORMAL || - rank_specs[i].rfunc == PROPORTION ) + if ( rank.rs[i].rfunc == NORMAL || + rank.rs[i].rfunc == PROPORTION ) tab_output_text_format (0, _("%s into %s(%s of %s using %s BY %s)"), - var_get_name (src_vars[v]), - var_get_name (rank_specs[i].destvars[v]), - function_name[rank_specs[i].rfunc], - var_get_name (src_vars[v]), - fraction_name(), + var_get_name (rank.vars[v]), + var_get_name (rank.rs[i].destvars[v]), + function_name[rank.rs[i].rfunc], + var_get_name (rank.vars[v]), + fraction_name (&rank), ds_cstr (&varlist)); else tab_output_text_format (0, _("%s into %s(%s of %s BY %s)"), - var_get_name (src_vars[v]), - var_get_name (rank_specs[i].destvars[v]), - function_name[rank_specs[i].rfunc], - var_get_name (src_vars[v]), + var_get_name (rank.vars[v]), + var_get_name (rank.rs[i].destvars[v]), + function_name[rank.rs[i].rfunc], + var_get_name (rank.vars[v]), ds_cstr (&varlist)); ds_destroy (&varlist); } else { - if ( rank_specs[i].rfunc == NORMAL || - rank_specs[i].rfunc == PROPORTION ) + if ( rank.rs[i].rfunc == NORMAL || + rank.rs[i].rfunc == PROPORTION ) tab_output_text_format (0, _("%s into %s(%s of %s using %s)"), - var_get_name (src_vars[v]), - var_get_name (rank_specs[i].destvars[v]), - function_name[rank_specs[i].rfunc], - var_get_name (src_vars[v]), - fraction_name()); + var_get_name (rank.vars[v]), + var_get_name (rank.rs[i].destvars[v]), + function_name[rank.rs[i].rfunc], + var_get_name (rank.vars[v]), + fraction_name (&rank)); else tab_output_text_format (0, _("%s into %s(%s of %s)"), - var_get_name (src_vars[v]), - var_get_name (rank_specs[i].destvars[v]), - function_name[rank_specs[i].rfunc], - var_get_name (src_vars[v])); + var_get_name (rank.vars[v]), + var_get_name (rank.rs[i].destvars[v]), + function_name[rank.rs[i].rfunc], + var_get_name (rank.vars[v])); } } } } - if ( 
cmd.sbc_fraction && - ( ! cmd.sbc_normal && ! cmd.sbc_proportion) ) - msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") ); - /* Add a variable which we can sort by to get back the original order */ order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0); @@ -768,13 +899,14 @@ cmd_rank (struct lexer *lexer, struct dataset *ds) add_transformation (ds, create_resort_key, 0, order); /* Do the ranking */ - result = rank_cmd (ds, &sc, rank_specs, n_rank_specs); - + result = rank_cmd (ds, &rank); + /* Put the active dataset back in its original order. Delete our sort key, which we don't need anymore. */ { struct casereader *sorted; + /* FIXME: loses error conditions. */ proc_discard_output (ds); @@ -783,175 +915,74 @@ cmd_rank (struct lexer *lexer, struct dataset *ds) dict_delete_var (dataset_dict (ds), order); result = dataset_set_source (ds, sorted) && result; + if ( result != true) + goto error; } - rank_cleanup(); + destroy_rank (&rank); + return CMD_SUCCESS; + error: - return (result ? 
CMD_SUCCESS : CMD_CASCADING_FAILURE); + destroy_rank (&rank); + return CMD_FAILURE; } -/* Parser for the variables sub command - Returns 1 on success */ -static int -rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED) -{ - lex_match (lexer, T_EQUALS); - - if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - return 2; - - if (!parse_sort_criteria (lexer, dataset_dict (ds), &sc, &src_vars, NULL)) - return 0; - n_src_vars = subcase_get_n_fields (&sc); - - if ( lex_match (lexer, T_BY) ) - { - if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokcstr (lexer)) == NULL)) - { - return 2; - } - - if (!parse_variables_const (lexer, dataset_dict (ds), - &group_vars, &n_group_vars, - PV_NO_DUPLICATE | PV_NO_SCRATCH) ) - { - free (group_vars); - return 0; - } - } - - return 1; -} - -/* Parse the [/rank INTO var1 var2 ... varN ] clause */ -static int -parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f) +static bool +rank_cmd (struct dataset *ds, const struct rank *cmd) { - int var_count = 0; - - n_rank_specs++; - rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs); - rank_specs[n_rank_specs - 1].rfunc = f; - rank_specs[n_rank_specs - 1].destvars = NULL; - - rank_specs[n_rank_specs - 1].destvars = - xcalloc (subcase_get_n_fields (&sc), sizeof (struct variable *)); + struct dictionary *d = dataset_dict (ds); + bool ok = true; + int i; - if (lex_match_id (lexer, "INTO")) + for (i = 0 ; i < subcase_get_n_fields (&cmd->sc) ; ++i ) { - struct variable *destvar; - - while( lex_token (lexer) == T_ID ) - { - - if ( dict_lookup_var (dict, lex_tokcstr (lexer)) != NULL ) - { - msg(SE, _("Variable %s already exists."), lex_tokcstr (lexer)); - return 0; - } - if ( var_count >= subcase_get_n_fields (&sc) ) - { - msg(SE, _("Too many variables in INTO 
clause.")); - return 0; - } - - destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokcstr (lexer)); - rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ; - - lex_get (lexer); - ++var_count; - } - } - - return 1; -} - - -static int -rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); - - return parse_rank_function (lexer, dict, cmd, RANK); -} - -static int -rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); - - return parse_rank_function (lexer, dict, cmd, NORMAL); -} - -static int -rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); - - return parse_rank_function (lexer, dict, cmd, PERCENT); -} - -static int -rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); - - return parse_rank_function (lexer, dict, cmd, RFRACTION); -} - -static int -rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); - - return parse_rank_function (lexer, dict, cmd, PROPORTION); -} - -static int -rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); + /* Rank variable at index I in SC. 
*/ + struct casegrouper *split_grouper; + struct casereader *split_group; + struct casewriter *output; - return parse_rank_function (lexer, dict, cmd, N); -} + proc_discard_output (ds); + split_grouper = casegrouper_create_splits (proc_open (ds), d); + output = autopaging_writer_create (dict_get_proto (d)); -static int -rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); + while (casegrouper_get_next_group (split_grouper, &split_group)) + { + struct subcase ordering; + struct casereader *ordered; + struct casegrouper *by_grouper; + struct casereader *by_group; - return parse_rank_function (lexer, dict, cmd, SAVAGE); -} + /* Sort this split group by the BY variables as primary + keys and the rank variable as secondary key. */ + subcase_init_vars (&ordering, cmd->group_vars, cmd->n_group_vars); + subcase_add_var (&ordering, cmd->vars[i], + subcase_get_direction (&cmd->sc, i)); + ordered = sort_execute (split_group, &ordering); + subcase_destroy (&ordering); + /* Rank the rank variable within this split group. */ + by_grouper = casegrouper_create_vars (ordered, + cmd->group_vars, cmd->n_group_vars); + while (casegrouper_get_next_group (by_grouper, &by_group)) + { + /* Rank the rank variable within this BY group + within the split group. 
*/ -static int -rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED ) -{ - struct dictionary *dict = dataset_dict (ds); + rank_sorted_file (by_group, output, d, i, cmd); - if ( lex_force_match (lexer, T_LPAREN) ) - { - if ( lex_force_int (lexer) ) - { - k_ntiles = lex_integer (lexer); - lex_get (lexer); - lex_force_match (lexer, T_RPAREN); - } - else - return 0; + } + ok = casegrouper_destroy (by_grouper) && ok; + } + ok = casegrouper_destroy (split_grouper); + ok = proc_commit (ds) && ok; + ok = (dataset_set_source (ds, casewriter_make_reader (output)) + && ok); + if (!ok) + break; } - else - return 0; - return parse_rank_function (lexer, dict, cmd, NTILES); + return ok; } - -/* - Local Variables: - mode: c - End: -*/ -- 2.30.2