From 1255b66223eac8ff1b0abf138ead22fd801c6633 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sat, 7 Oct 2006 03:10:14 +0000 Subject: [PATCH] Implemented the RANK command. --- doc/ChangeLog | 4 + doc/regression.texi | 2 +- doc/statistics.texi | 67 ++- po/de.po | 50 +- po/pspp.pot | 50 +- src/language/command.def | 3 +- src/language/stats/ChangeLog | 4 + src/language/stats/rank.q | 924 ++++++++++++++++++++++++++++---- src/language/xforms/ChangeLog | 4 + src/language/xforms/automake.mk | 1 + src/language/xforms/fail.c | 51 ++ tests/ChangeLog | 4 + tests/automake.mk | 1 + tests/command/rank.sh | 558 +++++++++++++++++++ 14 files changed, 1599 insertions(+), 124 deletions(-) create mode 100644 src/language/xforms/fail.c create mode 100755 tests/command/rank.sh diff --git a/doc/ChangeLog b/doc/ChangeLog index 03055447..ca7ba9a7 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +Sat Oct 7 11:02:44 WST 2006 + + * Added documentation for RANK. + Tue May 2 10:43:06 WST 2006 * data-file-format.texi: Updated information about the case_size diff --git a/doc/regression.texi b/doc/regression.texi index 74816307..ff91df23 100644 --- a/doc/regression.texi +++ b/doc/regression.texi @@ -1,4 +1,4 @@ -@node REGRESSION, ,ONEWAY, Statistics +@node REGRESSION, ,RANK, Statistics @comment node-name, next, previous, up @section REGRESSION diff --git a/doc/statistics.texi b/doc/statistics.texi index 2cb79d23..9f1923b8 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -14,6 +14,7 @@ far. * CROSSTABS:: Crosstabulation tables. * T-TEST:: Test hypotheses about means. * ONEWAY:: One way analysis of variance. +* RANK:: Compute rank scores. * REGRESSION:: Linear regression. @end menu @@ -600,7 +601,7 @@ of variable preceding @code{WITH} against variable following @code{WITH} are generated. 
-@node ONEWAY, REGRESSION, T-TEST, Statistics +@node ONEWAY, RANK, T-TEST, Statistics @comment node-name, next, previous, up @section ONEWAY @@ -650,4 +651,68 @@ The @code{CONTRASTS} subcommand may be given up to 10 times in order to specify different contrast tests. @setfilename ignored +@node RANK, REGRESSION, ONEWAY, Statistics +@comment node-name, next, previous, up +@section RANK + +@vindex RANK +@cindex RANK + +@display +RANK + [VARIABLES=] var_list [@{A,D@}] [BY var_list] + /TIES=@{MEAN,LOW,HIGH,CONDENSE@} + /FRACTION=@{BLOM,TUKEY,VW,RANKIT@} + /PRINT[=@{YES,NO@}] + /MISSING=@{EXCLUDE,INCLUDE@} + + /RANK [INTO var_list] + /NTILES(k) [INTO var_list] + /NORMAL [INTO var_list] + /PERCENT [INTO var_list] + /RFRACTION [INTO var_list] + /PROPORTION [INTO var_list] + /N [INTO var_list] + /SAVAGE [INTO var_list] +@end display + +The @cmd{RANK} command ranks variables and stores the results into new +variables. + +The VARIABLES subcommand, which is mandatory, specifies one or +more variables whose values are to be ranked. +After each variable, @samp{A} or @samp{D} may appear, indicating that +the variable is to be ranked in ascending or descending order. +Ascending is the default. +If a BY keyword appears, it should be followed by a list of variables +which are to serve as group variables. +In this case, the cases are gathered into groups, and ranks calculated +for each group. + +The TIES subcommand specifies how tied values are to be treated. The +default is to take the mean value of all the tied cases. + +The FRACTION subcommand specifies how proportional ranks are to be +calculated. This only has any effect if NORMAL or PROPORTION rank +functions are requested. + +The PRINT subcommand may be used to specify that a summary of the rank +variables created should appear in the output. + +The function subcommands are RANK, NTILES, NORMAL, PERCENT, RFRACTION, +PROPORTION, N and SAVAGE. Any number of function subcommands may appear. 
+If none are given, then the default is RANK. +The NTILES subcommand must take an integer specifying the number of +partitions into which values should be ranked. +Each subcommand may be followed by the INTO keyword and a list of +variables which are the variables to be created and receive the rank +scores. There may be as many variables specified as there are +variables named on the VARIABLES subcommand. If fewer are specified, +then the variable names are automatically created. + +The MISSING subcommand determines how user missing values are to be +treated. A setting of EXCLUDE means that variables whose values are +user-missing are to be excluded from the rank scores. A setting of +INCLUDE means they are to be included. The default is EXCLUDE. + @include regression.texi diff --git a/po/de.po b/po/de.po index c1ca5413..f17851f5 100644 --- a/po/de.po +++ b/po/de.po @@ -10,7 +10,7 @@ msgid "" msgstr "" "Project-Id-Version: PSPP 0.4.2\n" "Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" -"POT-Creation-Date: 2006-09-27 08:45+0800\n" +"POT-Creation-Date: 2006-10-06 17:58+0800\n" "PO-Revision-Date: 2006-05-26 17:49+0800\n" "Last-Translator: John Darrington \n" "Language-Team: German \n" @@ -3306,12 +3306,56 @@ msgstr "" msgid "Does not assume equal" msgstr "" -#: src/language/stats/rank.q:247 +#: src/language/stats/rank.q:221 +#, c-format +msgid "%s of %s by %s" +msgstr "" + +#: src/language/stats/rank.q:226 +#, c-format +msgid "%s of %s" +msgstr "" + +#: src/language/stats/rank.q:710 +msgid "Cannot create new rank variable. All candidates in use." 
+msgstr "" + +#: src/language/stats/rank.q:810 +msgid "Variables Created By RANK" +msgstr "" + +#: src/language/stats/rank.q:834 +#, c-format +msgid "%s into %s(%s of %s using %s BY %s)" +msgstr "" + +#: src/language/stats/rank.q:845 +#, c-format +msgid "%s into %s(%s of %s BY %s)" +msgstr "" + +#: src/language/stats/rank.q:859 +#, c-format +msgid "%s into %s(%s of %s using %s)" +msgstr "" + +#: src/language/stats/rank.q:869 +#, c-format +msgid "%s into %s(%s of %s)" +msgstr "" + +#: src/language/stats/rank.q:882 +msgid "" +"FRACTION has been specified, but NORMAL and PROPORTION rank functions have " +"not been requested. The FRACTION subcommand will be ignored." +msgstr "" + +#: src/language/stats/rank.q:974 #, c-format msgid "Variable %s already exists." msgstr "" -#: src/language/stats/rank.q:252 +#: src/language/stats/rank.q:979 msgid "Too many variables in INTO clause." msgstr "" diff --git a/po/pspp.pot b/po/pspp.pot index bd26c476..88294177 100644 --- a/po/pspp.pot +++ b/po/pspp.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" -"POT-Creation-Date: 2006-09-27 08:45+0800\n" +"POT-Creation-Date: 2006-10-06 17:58+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -3305,12 +3305,56 @@ msgstr "" msgid "Does not assume equal" msgstr "" -#: src/language/stats/rank.q:247 +#: src/language/stats/rank.q:221 +#, c-format +msgid "%s of %s by %s" +msgstr "" + +#: src/language/stats/rank.q:226 +#, c-format +msgid "%s of %s" +msgstr "" + +#: src/language/stats/rank.q:710 +msgid "Cannot create new rank variable. All candidates in use." 
+msgstr "" + +#: src/language/stats/rank.q:810 +msgid "Variables Created By RANK" +msgstr "" + +#: src/language/stats/rank.q:834 +#, c-format +msgid "%s into %s(%s of %s using %s BY %s)" +msgstr "" + +#: src/language/stats/rank.q:845 +#, c-format +msgid "%s into %s(%s of %s BY %s)" +msgstr "" + +#: src/language/stats/rank.q:859 +#, c-format +msgid "%s into %s(%s of %s using %s)" +msgstr "" + +#: src/language/stats/rank.q:869 +#, c-format +msgid "%s into %s(%s of %s)" +msgstr "" + +#: src/language/stats/rank.q:882 +msgid "" +"FRACTION has been specified, but NORMAL and PROPORTION rank functions have " +"not been requested. The FRACTION subcommand will be ignored." +msgstr "" + +#: src/language/stats/rank.q:974 #, c-format msgid "Variable %s already exists." msgstr "" -#: src/language/stats/rank.q:252 +#: src/language/stats/rank.q:979 msgid "Too many variables in INTO clause." msgstr "" diff --git a/src/language/command.def b/src/language/command.def index c8253955..7d3b95e0 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -109,6 +109,7 @@ DEF_CMD (S_DATA, 0, "MEANS", cmd_means) DEF_CMD (S_DATA, 0, "MODIFY VARS", cmd_modify_vars) DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway) DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlations) +DEF_CMD (S_DATA, 0, "RANK", cmd_rank) DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) DEF_CMD (S_DATA, 0, "RENAME VARIABLES", cmd_rename_variables) DEF_CMD (S_DATA, 0, "SAMPLE", cmd_sample) @@ -130,6 +131,7 @@ DEF_CMD (S_ANY, F_TESTING, "DEBUG CASEFILE", cmd_debug_casefile) DEF_CMD (S_ANY, F_TESTING, "DEBUG EVALUATE", cmd_debug_evaluate) DEF_CMD (S_ANY, F_TESTING, "DEBUG MOMENTS", cmd_debug_moments) DEF_CMD (S_ANY, F_TESTING, "DEBUG POOL", cmd_debug_pool) +DEF_CMD (S_ANY, F_TESTING, "DEBUG XFORM FAIL", cmd_debug_xform_fail) /* Unimplemented commands. 
*/ UNIMPL_CMD ("ACF", "Autocorrelation function") @@ -177,7 +179,6 @@ UNIMPL_CMD ("PROBIT", "Probit analysis") UNIMPL_CMD ("PROCEDURE OUTPUT", "Specify output file") UNIMPL_CMD ("PROXIMITIES", "Pairwise similarity") UNIMPL_CMD ("QUICK CLUSTER", "Fast clustering") -UNIMPL_CMD ("RANK", "Create rank scores") UNIMPL_CMD ("RECORD TYPE", "Defines a type of record within FILE TYPE") UNIMPL_CMD ("REFORMAT", "Read obsolete files") UNIMPL_CMD ("REPEATING DATA", "Specify multiple cases per input record") diff --git a/src/language/stats/ChangeLog b/src/language/stats/ChangeLog index eb943c60..9d45ca7b 100644 --- a/src/language/stats/ChangeLog +++ b/src/language/stats/ChangeLog @@ -1,3 +1,7 @@ +Sat Oct 7 11:06:01 WST 2006 John Darrington + + * rank.q: Implemented most of the RANK command. + 2006-07-14 Jason Stover * regression.q (run_regression): New function to move knowledge of diff --git a/src/language/stats/rank.q b/src/language/stats/rank.q index 71eaeccb..c102d195 100644 --- a/src/language/stats/rank.q +++ b/src/language/stats/rank.q @@ -1,7 +1,8 @@ /* PSPP - RANK. -*-c-*- -Copyright (C) 2005 Free Software Foundation, Inc. -Author: John Darrington 2005 +Copyright (C) 2005, 2006 Free Software Foundation, Inc. +Author: John Darrington , + Ben Pfaff . This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -25,10 +26,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA #include #include #include +#include +#include +#include +#include #include #include +#include #include #include +#include +#include + +#include +#include #include "gettext.h" #define _(msgid) gettext (msgid) @@ -47,11 +58,39 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +n=custom; +savage=custom; +print=print:!yes/no; + +fraction=fraction:!blom/tukey/vw/rankit; + +ties=ties:!mean/low/high/condense; missing=miss:!exclude/include. 
*/ /* (declarations) */ /* (functions) */ +typedef double (*rank_function_t) (double c, double cc, double cc_1, + int i, double w); + +static double rank_proportion (double c, double cc, double cc_1, + int i, double w); + +static double rank_normal (double c, double cc, double cc_1, + int i, double w); + +static double rank_percent (double c, double cc, double cc_1, + int i, double w); + +static double rank_rfraction (double c, double cc, double cc_1, + int i, double w); + +static double rank_rank (double c, double cc, double cc_1, + int i, double w); + +static double rank_n (double c, double cc, double cc_1, + int i, double w); + +static double rank_savage (double c, double cc, double cc_1, + int i, double w); + +static double rank_ntiles (double c, double cc, double cc_1, + int i, double w); enum RANK_FUNC @@ -64,6 +103,40 @@ enum RANK_FUNC N, NTILES, SAVAGE, + n_RANK_FUNCS + }; + +static const struct fmt_spec dest_format[n_RANK_FUNCS] = { + {FMT_F, 9, 3}, /* rank */ + {FMT_F, 6, 4}, /* normal */ + {FMT_F, 6, 2}, /* percent */ + {FMT_F, 6, 4}, /* rfraction */ + {FMT_F, 6, 4}, /* proportion */ + {FMT_F, 6, 0}, /* n */ + {FMT_F, 3, 0}, /* ntiles */ + {FMT_F, 8, 4} /* savage */ +}; + +static const char *function_name[n_RANK_FUNCS] = { + "RANK", + "NORMAL", + "PERCENT", + "RFRACTION", + "PROPORTION", + "N", + "NTILES", + "SAVAGE" +}; + +static rank_function_t rank_func[n_RANK_FUNCS] = { + rank_rank, + rank_normal, + rank_percent, + rank_rfraction, + rank_proportion, + rank_n, + rank_ntiles, + rank_savage }; @@ -71,10 +144,12 @@ struct rank_spec { enum RANK_FUNC rfunc; struct variable **destvars; - struct variable *srcvar; }; +/* Function to use for testing for missing values */ +static is_missing_func *value_is_missing; + static struct rank_spec *rank_specs; static size_t n_rank_specs; @@ -83,57 +158,759 @@ static struct sort_criteria *sc; static struct variable **group_vars; static size_t n_group_vars; +static struct variable **src_vars; +static size_t n_src_vars; + 
+ +static int k_ntiles; + static struct cmd_rank cmd; +static struct casefile *rank_sorted_casefile (struct casefile *cf, + const struct sort_criteria *, + const struct rank_spec *rs, + int n_rank_specs, + int idx, + const struct missing_values *miss + ); +static const char * +fraction_name(void) +{ + static char name[10]; + switch ( cmd.fraction ) + { + case RANK_BLOM: + strcpy (name, "BLOM"); + break; + case RANK_RANKIT: + strcpy (name, "RANKIT"); + break; + case RANK_TUKEY: + strcpy (name, "TUKEY"); + break; + case RANK_VW: + strcpy (name, "VW"); + break; + default: + NOT_REACHED (); + } + return name; +} + +/* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */ +static void +create_var_label (struct variable *dest_var, + const struct variable *src_var, enum RANK_FUNC f) +{ + struct string label; + ds_init_empty (&label); + + if ( n_group_vars > 0 ) + { + struct string group_var_str; + int g; + + ds_init_empty (&group_var_str); + + for (g = 0 ; g < n_group_vars ; ++g ) + { + if ( g > 0 ) ds_put_cstr (&group_var_str, " "); + ds_put_cstr (&group_var_str, group_vars[g]->name); + } + + ds_put_format (&label, _("%s of %s by %s"), function_name[f], + src_var->name, ds_cstr (&group_var_str)); + ds_destroy (&group_var_str); + } + else + ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name); + + dest_var->label = strdup (ds_cstr (&label) ); + + ds_destroy (&label); +} + + +static bool +rank_cmd (const struct sort_criteria *sc, + const struct rank_spec *rank_specs, int n_rank_specs) +{ + struct sort_criteria criteria; + bool result = true; + int i; + const int n_splits = dict_get_split_cnt (default_dict); + + criteria.crit_cnt = n_splits + n_group_vars + 1; + criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits); + for (i = 0; i < n_splits ; i++) + { + struct variable *v = dict_get_split_vars (default_dict)[i]; + criteria.crits[i].fv = v->fv; + criteria.crits[i].width = v->width; + criteria.crits[i].dir = SRT_ASCEND; 
+ } + for (i = 0; i < n_group_vars; i++) + { + criteria.crits[i + n_splits].fv = group_vars[i]->fv; + criteria.crits[i + n_splits].width = group_vars[i]->width; + criteria.crits[i + n_splits].dir = SRT_ASCEND; + } + for (i = 0 ; i < sc->crit_cnt ; ++i ) + { + struct casefile *out ; + struct casefile *cf ; + struct casereader *reader ; + struct casefile *sorted_cf ; + + /* Obtain active file in CF. */ + if (!procedure (NULL, NULL)) + return false; + cf = proc_capture_output (); + + /* Sort CF into SORTED_CF. */ + reader = casefile_get_destructive_reader (cf) ; + criteria.crits[criteria.crit_cnt - 1] = sc->crits[i]; + assert ( sc->crits[i].fv == src_vars[i]->fv ); + sorted_cf = sort_execute (reader, &criteria); + casefile_destroy (cf); + + out = rank_sorted_casefile (sorted_cf, &criteria, + rank_specs, n_rank_specs, + i, &src_vars[i]->miss) ; + if ( NULL == out ) + { + result = false ; + continue ; + } + + proc_set_source (storage_source_create (out)); + } + free (criteria.crits); + + return result ; +} + +/* Hardly a rank function !! 
*/ +static double +rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED, + int i UNUSED, double w) +{ + return w; +} + + +static double +rank_rank (double c, double cc, double cc_1, + int i, double w UNUSED) +{ + double rank; + if ( c >= 1.0 ) + { + switch (cmd.ties) + { + case RANK_LOW: + rank = cc_1 + 1; + break; + case RANK_HIGH: + rank = cc; + break; + case RANK_MEAN: + rank = cc_1 + (c + 1.0)/ 2.0; + break; + case RANK_CONDENSE: + rank = i; + break; + default: + NOT_REACHED (); + } + } + else + { + switch (cmd.ties) + { + case RANK_LOW: + rank = cc_1; + break; + case RANK_HIGH: + rank = cc; + break; + case RANK_MEAN: + rank = cc_1 + c / 2.0 ; + break; + case RANK_CONDENSE: + rank = i; + break; + default: + NOT_REACHED (); + } + } + + return rank; +} + + +static double +rank_rfraction (double c, double cc, double cc_1, + int i, double w) +{ + return rank_rank (c, cc, cc_1, i, w) / w ; +} + + +static double +rank_percent (double c, double cc, double cc_1, + int i, double w) +{ + return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ; +} + + +static double +rank_proportion (double c, double cc, double cc_1, + int i, double w) +{ + const double r = rank_rank (c, cc, cc_1, i, w) ; + + double f; + + switch ( cmd.fraction ) + { + case RANK_BLOM: + f = (r - 3.0/8.0) / (w + 0.25); + break; + case RANK_RANKIT: + f = (r - 0.5) / w ; + break; + case RANK_TUKEY: + f = (r - 1.0/3.0) / (w + 1.0/3.0); + break; + case RANK_VW: + f = r / ( w + 1.0); + break; + default: + NOT_REACHED (); + } + + + return (f > 0) ? 
f : SYSMIS; +} + +static double +rank_normal (double c, double cc, double cc_1, + int i, double w) +{ + double f = rank_proportion (c, cc, cc_1, i, w); + + return gsl_cdf_ugaussian_Pinv (f); +} + +static double +rank_ntiles (double c, double cc, double cc_1, + int i, double w) +{ + double r = rank_rank (c, cc, cc_1, i, w); + + + return ( floor (( r * k_ntiles) / ( w + 1) ) + 1); +} + +/* Expected value of the order statistics from an exponential distribution */ +static double +ee (int j, double w_star) +{ + int k; + double sum = 0.0; + + for (k = 1 ; k <= j; k++) + sum += 1.0 / ( w_star + 1 - k ); + + return sum; +} + + +static double +rank_savage (double c, double cc, double cc_1, + int i UNUSED, double w) +{ + double int_part; + const int i_1 = floor (cc_1); + const int i_2 = floor (cc); + + const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1; + + const double g_1 = cc_1 - i_1; + const double g_2 = cc - i_2; + + /* The second factor is infinite, when the first is zero. + Therefore, evaluate the second, only when the first is non-zero */ + const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1); + const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ; + + if ( i_1 == i_2 ) + return ee (i_1 + 1, w_star) - 1; + + if ( i_1 + 1 == i_2 ) + return ( ( expr1 + expr2 )/c ) - 1; + + if ( i_1 + 2 <= i_2 ) + { + int j; + double sigma = 0.0; + for (j = i_1 + 2 ; j <= i_2; ++j ) + sigma += ee (j, w_star); + return ( (expr1 + expr2 + sigma) / c) -1; + } + + NOT_REACHED(); +} + + +/* Rank the casefile belonging to CR, starting from the current + postition of CR continuing up to and including the ENDth case. + + RS points to an array containing the rank specifications to + use. N_RANK_SPECS is the number of elements of RS. + + + DEST_VAR_INDEX is the index into the rank_spec destvar element + to be used for this ranking. + + Prerequisites: 1. The casefile must be sorted according to CRITERION. + 2. 
W is the sum of the non-missing caseweights for this + range of the casefile. +*/ +static void +rank_cases (struct casereader *cr, + unsigned long end, + const struct sort_criterion *criterion, + const struct missing_values *mv, + double w, + const struct rank_spec *rs, + int n_rank_specs, + int dest_var_index, + struct casefile *dest) +{ + bool warn = true; + double cc = 0.0; + double cc_1; + int iter = 1; + + const int fv = criterion->fv; + const int width = criterion->width; + while (casereader_cnum (cr) < end) + { + struct casereader *lookahead; + const union value *this_value; + struct ccase this_case, lookahead_case; + double c; + int i; + size_t n = 0; + + if (!casereader_read_xfer (cr, &this_case)) + break; + + this_value = case_data (&this_case, fv); + c = dict_get_case_weight (default_dict, &this_case, &warn); + + lookahead = casereader_clone (cr); + n = 0; + while (casereader_cnum (lookahead) < end + && casereader_read_xfer (lookahead, &lookahead_case)) + { + const union value *lookahead_value = case_data (&lookahead_case, fv); + int diff = compare_values (this_value, lookahead_value, width); + + if (diff != 0) + { + /* Make sure the casefile was sorted */ + assert ( diff == ((criterion->dir == SRT_ASCEND) ? 
-1 :1)); + + case_destroy (&lookahead_case); + break; + } + + c += dict_get_case_weight (default_dict, &lookahead_case, &warn); + case_destroy (&lookahead_case); + n++; + } + casereader_destroy (lookahead); + + cc_1 = cc; + if ( !value_is_missing (mv, this_value) ) + cc += c; + + do + { + for (i = 0; i < n_rank_specs; ++i) + { + const int dest_idx = rs[i].destvars[dest_var_index]->fv; + + if ( value_is_missing (mv, this_value) ) + case_data_rw (&this_case, dest_idx)->f = SYSMIS; + else + case_data_rw (&this_case, dest_idx)->f = + rank_func[rs[i].rfunc](c, cc, cc_1, iter, w); + } + casefile_append_xfer (dest, &this_case); + } + while (n-- > 0 && casereader_read_xfer (cr, &this_case)); + + if ( !value_is_missing (mv, this_value) ) + iter++; + } + + /* If this isn't true, then all the results will be wrong */ + assert ( w == cc ); +} + +static bool +same_group (const struct ccase *a, const struct ccase *b, + const struct sort_criteria *crit) +{ + size_t i; + + for (i = 0; i < crit->crit_cnt - 1; i++) + { + struct sort_criterion *c = &crit->crits[i]; + if (compare_values (case_data (a, c->fv), case_data (b, c->fv), + c->width) != 0) + return false; + } + + return true; +} + +static struct casefile * +rank_sorted_casefile (struct casefile *cf, + const struct sort_criteria *crit, + const struct rank_spec *rs, + int n_rank_specs, + int dest_idx, + const struct missing_values *mv) +{ + struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf)); + struct casereader *lookahead = casefile_get_reader (cf); + struct casereader *pos = casereader_clone (lookahead); + struct ccase group_case; + bool warn = true; + + struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1]; + + if (casereader_read (lookahead, &group_case)) + { + struct ccase this_case; + const union value *this_value ; + double w = 0.0; + this_value = case_data( &group_case, ultimate_crit->fv); + + if ( !value_is_missing(mv, this_value) ) + w = dict_get_case_weight (default_dict, 
&group_case, &warn); + + while (casereader_read (lookahead, &this_case)) + { + const union value *this_value = + case_data(&this_case, ultimate_crit->fv); + double c = dict_get_case_weight (default_dict, &this_case, &warn); + if (!same_group (&group_case, &this_case, crit)) + { + rank_cases (pos, casereader_cnum (lookahead) - 1, + ultimate_crit, + mv, w, + rs, n_rank_specs, + dest_idx, dest); + + w = 0.0; + case_move (&group_case, &this_case); + } + if ( !value_is_missing (mv, this_value) ) + w += c; + } + rank_cases (pos, ULONG_MAX, ultimate_crit, mv, w, + rs, n_rank_specs, dest_idx, dest); + } + + if (casefile_error (dest)) + { + casefile_destroy (dest); + dest = NULL; + } + + casefile_destroy (cf); + return dest; +} + + +/* Transformation function to enumerate all the cases */ +static int +create_resort_key (void *key_var_, struct ccase *cc, casenum_t case_num) +{ + struct variable *key_var = key_var_; + + case_data_rw(cc, key_var->fv)->f = case_num; + + return TRNS_CONTINUE; +} + + +/* Create and return a new variable in which to store the ranks of SRC_VAR + accoring to the rank function F. + VNAME is the name of the variable to be created. + If VNAME is NULL, then a name will be automatically chosen. 
+ */ +static struct variable * +create_rank_variable (enum RANK_FUNC f, + const struct variable *src_var, + const char *vname) +{ + int i; + struct variable *var = NULL; + char name[SHORT_NAME_LEN + 1]; + + if ( vname ) + var = dict_create_var(default_dict, vname, 0); + + if ( NULL == var ) + { + snprintf(name, SHORT_NAME_LEN + 1, "%c%s", + function_name[f][0], src_var->name); + + var = dict_create_var(default_dict, name, 0); + } + + i = 1; + while( NULL == var ) + { + char func_abb[4]; + snprintf(func_abb, 4, "%s", function_name[f]); + snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb, + i); + + var = dict_create_var(default_dict, name, 0); + if (i++ >= 999) + break; + } + + i = 1; + while ( NULL == var ) + { + char func_abb[3]; + snprintf(func_abb, 3, "%s", function_name[f]); + + snprintf(name, SHORT_NAME_LEN + 1, + "RNK%s%02d", func_abb, i); + + var = dict_create_var(default_dict, name, 0); + if ( i++ >= 99 ) + break; + } + + if ( NULL == var ) + { + msg(ME, _("Cannot create new rank variable. 
All candidates in use.")); + return NULL; + } + + var->write = var->print = dest_format[f]; + + return var; +} int cmd_rank(void); +static void +rank_cleanup(void) +{ + int i; + + free (group_vars); + group_vars = NULL; + n_group_vars = 0; + + for (i = 0 ; i < n_rank_specs ; ++i ) + { + free (rank_specs[i].destvars); + } + + free (rank_specs); + rank_specs = NULL; + n_rank_specs = 0; + + sort_destroy_criteria (sc); + sc = NULL; + + free (src_vars); + src_vars = NULL; + n_src_vars = 0; +} + int cmd_rank(void) { + bool result; + struct variable *order; size_t i; n_rank_specs = 0; if ( !parse_rank(&cmd, NULL) ) + { + rank_cleanup (); return CMD_FAILURE; + } -#if 1 - for (i = 0 ; i < sc->crit_cnt ; ++i ) + /* If /MISSING=INCLUDE is set, then user missing values are ignored */ + if (cmd.miss == RANK_INCLUDE ) + value_is_missing = mv_is_value_system_missing; + else + value_is_missing = mv_is_value_missing; + + + /* Default to /RANK if no function subcommands are given */ + if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion || + cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n || + cmd.sbc_percent || cmd.sbc_rank ) ) { - struct sort_criterion *crit = &sc->crits[i]; + assert ( n_rank_specs == 0 ); - printf("Dir: %d; Index: %d\n", crit->dir, crit->fv); + rank_specs = xmalloc (sizeof (*rank_specs)); + rank_specs[0].rfunc = RANK; + rank_specs[0].destvars = + xcalloc (sc->crit_cnt, sizeof (struct variable *)); + + n_rank_specs = 1; } - for (i = 0 ; i < n_group_vars ; ++i ) - printf("Group var: %s\n",group_vars[0]->name); + assert ( sc->crit_cnt == n_src_vars); + /* Create variables for all rank destinations which haven't + already been created with INTO. + Add labels to all the destination variables. 
+ */ for (i = 0 ; i < n_rank_specs ; ++i ) { - int j; - printf("Ranks spec %d; Func: %d\n",i, rank_specs[i].rfunc); + int v; + for ( v = 0 ; v < n_src_vars ; v ++ ) + { + if ( rank_specs[i].destvars[v] == NULL ) + { + rank_specs[i].destvars[v] = + create_rank_variable (rank_specs[i].rfunc, src_vars[v], NULL); + } - for (j=0; j < sc->crit_cnt ; ++j ) - printf("Dest var is \"%s\"\n", rank_specs[i].destvars[j]->name); + create_var_label ( rank_specs[i].destvars[v], + src_vars[v], + rank_specs[i].rfunc); + } } -#endif + if ( cmd.print == RANK_YES ) + { + int v; - free(group_vars); + tab_output_text (0, _("Variables Created By RANK")); + tab_output_text (0, "\n"); - for (i = 0 ; i < n_rank_specs ; ++i ) - { - free(rank_specs[i].destvars); + for (i = 0 ; i < n_rank_specs ; ++i ) + { + for ( v = 0 ; v < n_src_vars ; v ++ ) + { + if ( n_group_vars > 0 ) + { + struct string varlist; + int g; + + ds_init_empty (&varlist); + for ( g = 0 ; g < n_group_vars ; ++g ) + { + ds_put_cstr (&varlist, group_vars[g]->name); + + if ( g < n_group_vars - 1) + ds_put_cstr (&varlist, " "); + } + + if ( rank_specs[i].rfunc == NORMAL || + rank_specs[i].rfunc == PROPORTION ) + tab_output_text (TAT_PRINTF, + _("%s into %s(%s of %s using %s BY %s)"), + src_vars[v]->name, + rank_specs[i].destvars[v]->name, + function_name[rank_specs[i].rfunc], + src_vars[v]->name, + fraction_name(), + ds_cstr (&varlist) + ); + + else + tab_output_text (TAT_PRINTF, + _("%s into %s(%s of %s BY %s)"), + src_vars[v]->name, + rank_specs[i].destvars[v]->name, + function_name[rank_specs[i].rfunc], + src_vars[v]->name, + ds_cstr (&varlist) + ); + ds_destroy (&varlist); + } + else + { + if ( rank_specs[i].rfunc == NORMAL || + rank_specs[i].rfunc == PROPORTION ) + tab_output_text (TAT_PRINTF, + _("%s into %s(%s of %s using %s)"), + src_vars[v]->name, + rank_specs[i].destvars[v]->name, + function_name[rank_specs[i].rfunc], + src_vars[v]->name, + fraction_name() + ); + + else + tab_output_text (TAT_PRINTF, + _("%s into %s(%s 
of %s)"), + src_vars[v]->name, + rank_specs[i].destvars[v]->name, + function_name[rank_specs[i].rfunc], + src_vars[v]->name + ); + } + } + } } - - free(rank_specs); - sort_destroy_criteria(sc); + if ( cmd.sbc_fraction && + ( ! cmd.sbc_normal && ! cmd.sbc_proportion) ) + msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") ); + + /* Add a variable which we can sort by to get back the original + order */ + order = dict_create_var_assert (default_dict, "$ORDER_", 0); - return CMD_SUCCESS; + add_transformation (create_resort_key, 0, order); + + /* Do the ranking */ + result = rank_cmd (sc, rank_specs, n_rank_specs); + + /* Put the active file back in its original order */ + { + struct sort_criteria criteria; + struct sort_criterion restore_criterion ; + restore_criterion.fv = order->fv; + restore_criterion.width = 0; + restore_criterion.dir = SRT_ASCEND; + + criteria.crits = &restore_criterion; + criteria.crit_cnt = 1; + + sort_active_file_in_place (&criteria); } + /* ... and we don't need our sort key anymore. So delete it */ + dict_delete_var (default_dict, order); + + rank_cleanup(); + + return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE); +} /* Parser for the variables sub command @@ -149,7 +926,8 @@ rank_custom_variables(struct cmd_rank *cmd UNUSED, void *aux UNUSED) && token != T_ALL) return 2; - sc = sort_parse_criteria (default_dict, 0, 0, 0, terminators); + sc = sort_parse_criteria (default_dict, + &src_vars, &n_src_vars, 0, terminators); if ( lex_match(T_BY) ) { @@ -170,67 +948,16 @@ rank_custom_variables(struct cmd_rank *cmd UNUSED, void *aux UNUSED) } -/* Return a name for a new variable which ranks the variable VAR_NAME, - according to the ranking function F. - If IDX is non zero, then IDX is used as a disambiguating number. - FIXME: This is not very robust. 
-*/ -static char * -new_variable_name(const char *ranked_var_name, enum RANK_FUNC f, int idx) -{ - static char new_name[SHORT_NAME_LEN + 1]; - char temp[SHORT_NAME_LEN + 1]; - - if ( idx == 0 ) - { - switch (f) - { - case RANK: - case RFRACTION: - strcpy(new_name,"R"); - break; - - case NORMAL: - case N: - case NTILES: - strcpy(new_name,"N"); - break; - - case PERCENT: - case PROPORTION: - strcpy(new_name,"P"); - break; - - case SAVAGE: - strcpy(new_name,"S"); - break; - - default: - assert(false); - break; - } - - strncat(new_name, ranked_var_name, 7); - } - else - { - strncpy(temp, ranked_var_name, 3); - snprintf(new_name, SHORT_NAME_LEN, "%s%03d", temp, idx); - } - - return new_name; -} - /* Parse the [/rank INTO var1 var2 ... varN ] clause */ static int parse_rank_function(struct cmd_rank *cmd UNUSED, enum RANK_FUNC f) { - static const struct fmt_spec f8_2 = {FMT_F, 8, 2}; int var_count = 0; n_rank_specs++; rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs); rank_specs[n_rank_specs - 1].rfunc = f; + rank_specs[n_rank_specs - 1].destvars = NULL; rank_specs[n_rank_specs - 1].destvars = xcalloc (sc->crit_cnt, sizeof (struct variable *)); @@ -241,58 +968,26 @@ parse_rank_function(struct cmd_rank *cmd UNUSED, enum RANK_FUNC f) while( token == T_ID ) { - ++var_count; + if ( dict_lookup_var (default_dict, tokid) != NULL ) { msg(SE, _("Variable %s already exists."), tokid); return 0; } - if ( var_count > sc->crit_cnt ) + if ( var_count >= sc->crit_cnt ) { msg(SE, _("Too many variables in INTO clause.")); return 0; } - destvar = dict_create_var (default_dict, tokid, 0); - if ( destvar ) - { - destvar->print = destvar->write = f8_2; - } - - rank_specs[n_rank_specs - 1].destvars[var_count - 1] = destvar ; + destvar = create_rank_variable (f, src_vars[var_count], tokid); + rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ; lex_get(); - + ++var_count; } } - /* Allocate rank variable names to all those which haven't had INTO - variables assigned 
*/ - while (var_count < sc->crit_cnt) - { - static int idx=0; - struct variable *destvar ; - const struct variable *v = dict_get_var(default_dict, - sc->crits[var_count].fv); - - char *new_name; - - do { - new_name = new_variable_name(v->name, f, idx); - - destvar = dict_create_var (default_dict, new_name, 0); - if (!destvar ) - ++idx; - - } while( !destvar ) ; - - destvar->print = destvar->write = f8_2; - - rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ; - - ++var_count; - } - return 1; } @@ -312,7 +1007,7 @@ rank_custom_normal(struct cmd_rank *cmd, void *aux UNUSED ) static int rank_custom_percent(struct cmd_rank *cmd, void *aux UNUSED ) { - return parse_rank_function(cmd, NORMAL); + return parse_rank_function (cmd, PERCENT); } static int @@ -347,6 +1042,7 @@ rank_custom_ntiles(struct cmd_rank *cmd, void *aux UNUSED ) { if ( lex_force_int() ) { + k_ntiles = lex_integer (); lex_get(); lex_force_match(')'); } @@ -358,5 +1054,3 @@ rank_custom_ntiles(struct cmd_rank *cmd, void *aux UNUSED ) return parse_rank_function(cmd, NTILES); } - - diff --git a/src/language/xforms/ChangeLog b/src/language/xforms/ChangeLog index dc3941f8..19bcf233 100644 --- a/src/language/xforms/ChangeLog +++ b/src/language/xforms/ChangeLog @@ -1,3 +1,7 @@ +Sat Oct 7 11:04:01 WST 2006 John Darrington + + * automake.mk fail.c: Added a debug transformation which always fails. + Sat May 6 16:02:55 2006 Ben Pfaff Get rid of `char *c' member in union value, for cleanliness. 
diff --git a/src/language/xforms/automake.mk b/src/language/xforms/automake.mk index 3e88a984..50716edf 100644 --- a/src/language/xforms/automake.mk +++ b/src/language/xforms/automake.mk @@ -6,6 +6,7 @@ noinst_LIBRARIES += src/language/xforms/libxforms.a src_language_xforms_libxforms_a_SOURCES = \ src/language/xforms/compute.c \ src/language/xforms/count.c \ + src/language/xforms/fail.c \ src/language/xforms/sample.c \ src/language/xforms/recode.c \ src/language/xforms/select-if.c diff --git a/src/language/xforms/fail.c b/src/language/xforms/fail.c new file mode 100644 index 00000000..153ff42e --- /dev/null +++ b/src/language/xforms/fail.c @@ -0,0 +1,51 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2007 Free Software Foundation, Inc. + Written by John Darrington + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#include + +#include + +#include +#include +#include +#include +#include + +static int trns_fail (void *x, struct ccase *c, casenum_t n); + + + +/* A transformation which is guaranteed to fail. 
*/ + +static int +trns_fail (void *x UNUSED, struct ccase *c UNUSED, + casenum_t n UNUSED) +{ + return TRNS_ERROR; +} + + +int +cmd_debug_xform_fail (void) +{ + + add_transformation (trns_fail, NULL, NULL); + + return lex_end_of_command (); +} diff --git a/tests/ChangeLog b/tests/ChangeLog index 24bde940..de03c0db 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,7 @@ +Sat Oct 7 11:06:59 WST 2006 John Darrington + + * command/rank.sh: New file + Sun Jul 16 21:08:51 2006 Ben Pfaff * command/print.sh: Update output to match PRINT revisions. diff --git a/tests/automake.mk b/tests/automake.mk index 3482ef2d..76494efe 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -33,6 +33,7 @@ TESTS = \ tests/command/permissions.sh \ tests/command/print.sh \ tests/command/print-strings.sh \ + tests/command/rank.sh \ tests/command/rename.sh \ tests/command/regression.sh \ tests/command/sample.sh \ diff --git a/tests/command/rank.sh b/tests/command/rank.sh new file mode 100755 index 00000000..aac20882 --- /dev/null +++ b/tests/command/rank.sh @@ -0,0 +1,558 @@ +#!/bin/sh + +# This program tests the RANK command. + +TEMPDIR=/tmp/pspp-tst-$$ +TESTFILE=$TEMPDIR/`basename $0`.sps + +# ensure that top_srcdir and top_builddir are absolute +if [ -z "$top_srcdir" ] ; then top_srcdir=. ; fi +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +top_srcdir=`cd $top_srcdir; pwd` +top_builddir=`cd $top_builddir; pwd` + +PSPP=$top_builddir/src/ui/terminal/pspp + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + +LANG=C +export LANG + + +cleanup() +{ + if [ x"$PSPP_TEST_NO_CLEANUP" != x ] ; then + echo "NOT cleaning $TEMPDIR" + return ; + fi + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + + +# Some tests for proper behaviour in the face of invalid input. 
+activity="create file 1" +cat > $TESTFILE < $TEMPDIR/err +if [ $? -ne 1 ] ; then fail ; fi + +activity="diff 1" +perl -pi -e 's/^\s*$//g' $TEMPDIR/err +diff -b $TEMPDIR/err - < $TESTFILE < $TEMPDIR/errs +if [ $? -ne 1 ] ; then fail ; fi + +activity="compare errors" +perl -pi -e 's/^\s*$//g' $TEMPDIR/errs +diff -b $TEMPDIR/errs - << EOF +$TEMPDIR/rank.sh.sps:15: error: RANK: Syntax error expecting \`(' at end of command. +$TEMPDIR/rank.sh.sps:19: error: RANK: Syntax error expecting integer at \`d'. +$TEMPDIR/rank.sh.sps:25: error: RANK: Variable x already exists. +$TEMPDIR/rank.sh.sps:30: error: RANK: Too many variables in INTO clause. +$TEMPDIR/rank.sh.sps:33: warning: RANK: a is not a numeric variable. It will not be included in the variable list. +EOF +if [ $? -ne 0 ] ; then fail ; fi + +# Now some real tests. + +activity="create file 3" +cat > $TESTFILE < $TESTFILE < /dev/null +if [ $? -ne 0 ] ; then fail ; fi + + +activity="compare output 4" +perl -pi -e 's/^\s*$//g' $TEMPDIR/pspp.list +diff -b $TEMPDIR/pspp.list - << EOF +Variables Created By RANK +x into RNKRA01(RANK of x) + x rx RNKRA01 +-------- -------- --------- + 1.00 . 1.000 + 2.00 . 2.000 + 3.00 . 3.000 + 4.00 . 4.000 + 5.00 . 5.000 + 6.00 . 6.000 + 7.00 . 7.000 +EOF +if [ $? -ne 0 ] ; then fail ; fi + +pass; -- 2.30.2