1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char * const function_name[n_RANK_FUNCS] = {
131 static const rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct dictionary *,
172 const struct rank_spec *rs,
175 const struct missing_values *miss
180 static char name[10];
181 switch ( cmd.fraction )
184 strcpy (name, "BLOM");
187 strcpy (name, "RANKIT");
190 strcpy (name, "TUKEY");
201 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
203 create_var_label (struct variable *dest_var,
204 const struct variable *src_var, enum RANK_FUNC f)
207 ds_init_empty (&label);
209 if ( n_group_vars > 0 )
211 struct string group_var_str;
214 ds_init_empty (&group_var_str);
216 for (g = 0 ; g < n_group_vars ; ++g )
218 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
219 ds_put_cstr (&group_var_str, group_vars[g]->name);
222 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
223 src_var->name, ds_cstr (&group_var_str));
224 ds_destroy (&group_var_str);
227 ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name);
229 dest_var->label = strdup (ds_cstr (&label) );
236 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
237 const struct rank_spec *rank_specs, int n_rank_specs)
239 struct sort_criteria criteria;
242 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
244 criteria.crit_cnt = n_splits + n_group_vars + 1;
245 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
246 for (i = 0; i < n_splits ; i++)
248 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
249 criteria.crits[i].fv = v->fv;
250 criteria.crits[i].width = v->width;
251 criteria.crits[i].dir = SRT_ASCEND;
253 for (i = 0; i < n_group_vars; i++)
255 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
256 criteria.crits[i + n_splits].width = group_vars[i]->width;
257 criteria.crits[i + n_splits].dir = SRT_ASCEND;
259 for (i = 0 ; i < sc->crit_cnt ; ++i )
261 struct casefile *out ;
262 struct casefile *cf ;
263 struct casereader *reader ;
264 struct casefile *sorted_cf ;
266 /* Obtain active file in CF. */
267 if (!procedure (ds, NULL, NULL))
270 cf = proc_capture_output (ds);
272 /* Sort CF into SORTED_CF. */
273 reader = casefile_get_destructive_reader (cf) ;
274 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
275 assert ( sc->crits[i].fv == src_vars[i]->fv );
276 sorted_cf = sort_execute (reader, &criteria);
277 casefile_destroy (cf);
279 out = rank_sorted_casefile (sorted_cf, &criteria,
281 rank_specs, n_rank_specs,
282 i, &src_vars[i]->miss) ;
289 proc_set_source (ds, storage_source_create (out));
292 free (criteria.crits);
296 free (criteria.crits);
300 /* Hardly a rank function !! */
302 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
303 int i UNUSED, double w)
310 rank_rank (double c, double cc, double cc_1,
311 int i, double w UNUSED)
325 rank = cc_1 + (c + 1.0)/ 2.0;
345 rank = cc_1 + c / 2.0 ;
360 rank_rfraction (double c, double cc, double cc_1,
363 return rank_rank (c, cc, cc_1, i, w) / w ;
368 rank_percent (double c, double cc, double cc_1,
371 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
376 rank_proportion (double c, double cc, double cc_1,
379 const double r = rank_rank (c, cc, cc_1, i, w) ;
383 switch ( cmd.fraction )
386 f = (r - 3.0/8.0) / (w + 0.25);
392 f = (r - 1.0/3.0) / (w + 1.0/3.0);
402 return (f > 0) ? f : SYSMIS;
406 rank_normal (double c, double cc, double cc_1,
409 double f = rank_proportion (c, cc, cc_1, i, w);
411 return gsl_cdf_ugaussian_Pinv (f);
415 rank_ntiles (double c, double cc, double cc_1,
418 double r = rank_rank (c, cc, cc_1, i, w);
421 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
424 /* Expected value of the order statistics from an exponential distribution */
426 ee (int j, double w_star)
431 for (k = 1 ; k <= j; k++)
432 sum += 1.0 / ( w_star + 1 - k );
439 rank_savage (double c, double cc, double cc_1,
440 int i UNUSED, double w)
443 const int i_1 = floor (cc_1);
444 const int i_2 = floor (cc);
446 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
448 const double g_1 = cc_1 - i_1;
449 const double g_2 = cc - i_2;
451 /* The second factor is infinite, when the first is zero.
452 Therefore, evaluate the second, only when the first is non-zero */
453 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
454 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
457 return ee (i_1 + 1, w_star) - 1;
459 if ( i_1 + 1 == i_2 )
460 return ( ( expr1 + expr2 )/c ) - 1;
462 if ( i_1 + 2 <= i_2 )
466 for (j = i_1 + 2 ; j <= i_2; ++j )
467 sigma += ee (j, w_star);
468 return ( (expr1 + expr2 + sigma) / c) -1;
475 /* Rank the casefile belonging to CR, starting from the current
476 postition of CR continuing up to and including the ENDth case.
478 RS points to an array containing the rank specifications to
479 use. N_RANK_SPECS is the number of elements of RS.
482 DEST_VAR_INDEX is the index into the rank_spec destvar element
483 to be used for this ranking.
485 Prerequisites: 1. The casefile must be sorted according to CRITERION.
486 2. W is the sum of the non-missing caseweights for this
487 range of the casefile.
490 rank_cases (struct casereader *cr,
492 const struct dictionary *dict,
493 const struct sort_criterion *criterion,
494 const struct missing_values *mv,
496 const struct rank_spec *rs,
499 struct casefile *dest)
506 const int fv = criterion->fv;
507 const int width = criterion->width;
509 while (casereader_cnum (cr) < end)
511 struct casereader *lookahead;
512 const union value *this_value;
513 struct ccase this_case, lookahead_case;
518 if (!casereader_read_xfer (cr, &this_case))
521 this_value = case_data (&this_case, fv);
522 c = dict_get_case_weight (dict, &this_case, &warn);
524 lookahead = casereader_clone (cr);
526 while (casereader_cnum (lookahead) < end
527 && casereader_read_xfer (lookahead, &lookahead_case))
529 const union value *lookahead_value = case_data (&lookahead_case, fv);
530 int diff = compare_values (this_value, lookahead_value, width);
534 /* Make sure the casefile was sorted */
535 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
537 case_destroy (&lookahead_case);
541 c += dict_get_case_weight (dict, &lookahead_case, &warn);
542 case_destroy (&lookahead_case);
545 casereader_destroy (lookahead);
548 if ( !value_is_missing (mv, this_value) )
553 for (i = 0; i < n_rank_specs; ++i)
555 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
557 if ( value_is_missing (mv, this_value) )
558 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
560 case_data_rw (&this_case, dest_idx)->f =
561 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
563 casefile_append_xfer (dest, &this_case);
565 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
567 if ( !value_is_missing (mv, this_value) )
571 /* If this isn't true, then all the results will be wrong */
576 same_group (const struct ccase *a, const struct ccase *b,
577 const struct sort_criteria *crit)
581 for (i = 0; i < crit->crit_cnt - 1; i++)
583 struct sort_criterion *c = &crit->crits[i];
584 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
592 static struct casefile *
593 rank_sorted_casefile (struct casefile *cf,
594 const struct sort_criteria *crit,
595 const struct dictionary *dict,
596 const struct rank_spec *rs,
599 const struct missing_values *mv)
601 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
602 struct casereader *lookahead = casefile_get_reader (cf);
603 struct casereader *pos = casereader_clone (lookahead);
604 struct ccase group_case;
607 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
609 if (casereader_read (lookahead, &group_case))
611 struct ccase this_case;
612 const union value *this_value ;
614 this_value = case_data( &group_case, ultimate_crit->fv);
616 if ( !value_is_missing(mv, this_value) )
617 w = dict_get_case_weight (dict, &group_case, &warn);
619 while (casereader_read (lookahead, &this_case))
621 const union value *this_value =
622 case_data(&this_case, ultimate_crit->fv);
623 double c = dict_get_case_weight (dict, &this_case, &warn);
624 if (!same_group (&group_case, &this_case, crit))
626 rank_cases (pos, casereader_cnum (lookahead) - 1,
634 case_destroy (&group_case);
635 case_move (&group_case, &this_case);
637 if ( !value_is_missing (mv, this_value) )
639 case_destroy (&this_case);
641 case_destroy (&group_case);
642 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
643 rs, n_rank_specs, dest_idx, dest);
646 if (casefile_error (dest))
648 casefile_destroy (dest);
652 casefile_destroy (cf);
657 /* Transformation function to enumerate all the cases */
659 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
661 struct variable *key_var = key_var_;
663 case_data_rw(cc, key_var->fv)->f = case_num;
665 return TRNS_CONTINUE;
669 /* Create and return a new variable in which to store the ranks of SRC_VAR
670 accoring to the rank function F.
671 VNAME is the name of the variable to be created.
672 If VNAME is NULL, then a name will be automatically chosen.
674 static struct variable *
675 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
676 const struct variable *src_var,
680 struct variable *var = NULL;
681 char name[SHORT_NAME_LEN + 1];
684 var = dict_create_var(dict, vname, 0);
688 snprintf(name, SHORT_NAME_LEN + 1, "%c%s",
689 function_name[f][0], src_var->name);
691 var = dict_create_var(dict, name, 0);
698 snprintf(func_abb, 4, "%s", function_name[f]);
699 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
702 var = dict_create_var(dict, name, 0);
708 while ( NULL == var )
711 snprintf(func_abb, 3, "%s", function_name[f]);
713 snprintf(name, SHORT_NAME_LEN + 1,
714 "RNK%s%02d", func_abb, i);
716 var = dict_create_var(dict, name, 0);
723 msg(ME, _("Cannot create new rank variable. All candidates in use."));
727 var->write = var->print = dest_format[f];
732 int cmd_rank(struct dataset *ds);
743 for (i = 0 ; i < n_rank_specs ; ++i )
745 free (rank_specs[i].destvars);
752 sort_destroy_criteria (sc);
761 cmd_rank (struct dataset *ds)
764 struct variable *order;
768 if ( !parse_rank (ds, &cmd, NULL) )
774 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
775 if (cmd.miss == RANK_INCLUDE )
776 value_is_missing = mv_is_value_system_missing;
778 value_is_missing = mv_is_value_missing;
781 /* Default to /RANK if no function subcommands are given */
782 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
783 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
784 cmd.sbc_percent || cmd.sbc_rank ) )
786 assert ( n_rank_specs == 0 );
788 rank_specs = xmalloc (sizeof (*rank_specs));
789 rank_specs[0].rfunc = RANK;
790 rank_specs[0].destvars =
791 xcalloc (sc->crit_cnt, sizeof (struct variable *));
796 assert ( sc->crit_cnt == n_src_vars);
798 /* Create variables for all rank destinations which haven't
799 already been created with INTO.
800 Add labels to all the destination variables.
802 for (i = 0 ; i < n_rank_specs ; ++i )
805 for ( v = 0 ; v < n_src_vars ; v ++ )
807 if ( rank_specs[i].destvars[v] == NULL )
809 rank_specs[i].destvars[v] =
810 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
813 create_var_label ( rank_specs[i].destvars[v],
815 rank_specs[i].rfunc);
819 if ( cmd.print == RANK_YES )
823 tab_output_text (0, _("Variables Created By RANK"));
824 tab_output_text (0, "\n");
826 for (i = 0 ; i < n_rank_specs ; ++i )
828 for ( v = 0 ; v < n_src_vars ; v ++ )
830 if ( n_group_vars > 0 )
832 struct string varlist;
835 ds_init_empty (&varlist);
836 for ( g = 0 ; g < n_group_vars ; ++g )
838 ds_put_cstr (&varlist, group_vars[g]->name);
840 if ( g < n_group_vars - 1)
841 ds_put_cstr (&varlist, " ");
844 if ( rank_specs[i].rfunc == NORMAL ||
845 rank_specs[i].rfunc == PROPORTION )
846 tab_output_text (TAT_PRINTF,
847 _("%s into %s(%s of %s using %s BY %s)"),
849 rank_specs[i].destvars[v]->name,
850 function_name[rank_specs[i].rfunc],
857 tab_output_text (TAT_PRINTF,
858 _("%s into %s(%s of %s BY %s)"),
860 rank_specs[i].destvars[v]->name,
861 function_name[rank_specs[i].rfunc],
865 ds_destroy (&varlist);
869 if ( rank_specs[i].rfunc == NORMAL ||
870 rank_specs[i].rfunc == PROPORTION )
871 tab_output_text (TAT_PRINTF,
872 _("%s into %s(%s of %s using %s)"),
874 rank_specs[i].destvars[v]->name,
875 function_name[rank_specs[i].rfunc],
881 tab_output_text (TAT_PRINTF,
882 _("%s into %s(%s of %s)"),
884 rank_specs[i].destvars[v]->name,
885 function_name[rank_specs[i].rfunc],
893 if ( cmd.sbc_fraction &&
894 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
895 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
897 /* Add a variable which we can sort by to get back the original
899 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
901 add_transformation (ds, create_resort_key, 0, order);
904 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
906 /* Put the active file back in its original order */
908 struct sort_criteria criteria;
909 struct sort_criterion restore_criterion ;
910 restore_criterion.fv = order->fv;
911 restore_criterion.width = 0;
912 restore_criterion.dir = SRT_ASCEND;
914 criteria.crits = &restore_criterion;
915 criteria.crit_cnt = 1;
917 sort_active_file_in_place (ds, &criteria);
920 /* ... and we don't need our sort key anymore. So delete it */
921 dict_delete_var (dataset_dict (ds), order);
925 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
929 /* Parser for the variables sub command
930 Returns 1 on success */
932 rank_custom_variables (struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
934 static const int terminators[2] = {T_BY, 0};
938 if ((token != T_ID || dict_lookup_var (dataset_dict (ds), tokid) == NULL)
942 sc = sort_parse_criteria (dataset_dict (ds),
943 &src_vars, &n_src_vars, 0, terminators);
945 if ( lex_match(T_BY) )
947 if ((token != T_ID || dict_lookup_var (dataset_dict (ds), tokid) == NULL))
952 if (!parse_variables (dataset_dict (ds), &group_vars, &n_group_vars,
953 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
964 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
966 parse_rank_function(struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
971 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
972 rank_specs[n_rank_specs - 1].rfunc = f;
973 rank_specs[n_rank_specs - 1].destvars = NULL;
975 rank_specs[n_rank_specs - 1].destvars =
976 xcalloc (sc->crit_cnt, sizeof (struct variable *));
978 if (lex_match_id("INTO"))
980 struct variable *destvar;
982 while( token == T_ID )
985 if ( dict_lookup_var (dict, tokid) != NULL )
987 msg(SE, _("Variable %s already exists."), tokid);
990 if ( var_count >= sc->crit_cnt )
992 msg(SE, _("Too many variables in INTO clause."));
996 destvar = create_rank_variable (dict, f, src_vars[var_count], tokid);
997 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1009 rank_custom_rank(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1011 struct dictionary *dict = dataset_dict (ds);
1013 return parse_rank_function (dict, cmd, RANK);
1017 rank_custom_normal(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1019 struct dictionary *dict = dataset_dict (ds);
1021 return parse_rank_function (dict, cmd, NORMAL);
1025 rank_custom_percent(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1027 struct dictionary *dict = dataset_dict (ds);
1029 return parse_rank_function (dict, cmd, PERCENT);
1033 rank_custom_rfraction(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1035 struct dictionary *dict = dataset_dict (ds);
1037 return parse_rank_function (dict, cmd, RFRACTION);
1041 rank_custom_proportion(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1043 struct dictionary *dict = dataset_dict (ds);
1045 return parse_rank_function (dict, cmd, PROPORTION);
1049 rank_custom_n (struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1051 struct dictionary *dict = dataset_dict (ds);
1053 return parse_rank_function (dict, cmd, N);
1057 rank_custom_savage(struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1059 struct dictionary *dict = dataset_dict (ds);
1061 return parse_rank_function (dict, cmd, SAVAGE);
1066 rank_custom_ntiles (struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1068 struct dictionary *dict = dataset_dict (ds);
1070 if ( lex_force_match('(') )
1072 if ( lex_force_int() )
1074 k_ntiles = lex_integer ();
1076 lex_force_match(')');
1084 return parse_rank_function(dict, cmd, NTILES);