1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char * const function_name[n_RANK_FUNCS] = {
131 static const rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static mv_is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct dictionary *,
172 const struct rank_spec *rs,
175 const struct missing_values *miss
180 static char name[10];
181 switch ( cmd.fraction )
184 strcpy (name, "BLOM");
187 strcpy (name, "RANKIT");
190 strcpy (name, "TUKEY");
201 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
203 create_var_label (struct variable *dest_var,
204 const struct variable *src_var, enum RANK_FUNC f)
207 ds_init_empty (&label);
209 if ( n_group_vars > 0 )
211 struct string group_var_str;
214 ds_init_empty (&group_var_str);
216 for (g = 0 ; g < n_group_vars ; ++g )
218 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
219 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
222 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
223 var_get_name (src_var), ds_cstr (&group_var_str));
224 ds_destroy (&group_var_str);
227 ds_put_format (&label, _("%s of %s"),
228 function_name[f], var_get_name (src_var));
230 var_set_label (dest_var, ds_cstr (&label));
237 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
238 const struct rank_spec *rank_specs, int n_rank_specs)
240 struct sort_criteria criteria;
243 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
245 criteria.crit_cnt = n_splits + n_group_vars + 1;
246 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
247 for (i = 0; i < n_splits ; i++)
249 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
250 criteria.crits[i].fv = v->fv;
251 criteria.crits[i].width = var_get_width (v);
252 criteria.crits[i].dir = SRT_ASCEND;
254 for (i = 0; i < n_group_vars; i++)
256 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
257 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
258 criteria.crits[i + n_splits].dir = SRT_ASCEND;
260 for (i = 0 ; i < sc->crit_cnt ; ++i )
262 struct casefile *out ;
263 struct casefile *cf ;
264 struct casereader *reader ;
265 struct casefile *sorted_cf ;
267 /* Obtain active file in CF. */
268 if (!procedure (ds, NULL, NULL))
271 cf = proc_capture_output (ds);
273 /* Sort CF into SORTED_CF. */
274 reader = casefile_get_destructive_reader (cf) ;
275 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
276 assert ( sc->crits[i].fv == src_vars[i]->fv );
277 sorted_cf = sort_execute (reader, &criteria);
278 casefile_destroy (cf);
280 out = rank_sorted_casefile (sorted_cf, &criteria,
282 rank_specs, n_rank_specs,
283 i, var_get_missing_values (src_vars[i]));
290 proc_set_source (ds, storage_source_create (out));
293 free (criteria.crits);
297 free (criteria.crits);
301 /* Hardly a rank function !! */
303 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
304 int i UNUSED, double w)
311 rank_rank (double c, double cc, double cc_1,
312 int i, double w UNUSED)
326 rank = cc_1 + (c + 1.0)/ 2.0;
346 rank = cc_1 + c / 2.0 ;
361 rank_rfraction (double c, double cc, double cc_1,
364 return rank_rank (c, cc, cc_1, i, w) / w ;
369 rank_percent (double c, double cc, double cc_1,
372 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
377 rank_proportion (double c, double cc, double cc_1,
380 const double r = rank_rank (c, cc, cc_1, i, w) ;
384 switch ( cmd.fraction )
387 f = (r - 3.0/8.0) / (w + 0.25);
393 f = (r - 1.0/3.0) / (w + 1.0/3.0);
403 return (f > 0) ? f : SYSMIS;
407 rank_normal (double c, double cc, double cc_1,
410 double f = rank_proportion (c, cc, cc_1, i, w);
412 return gsl_cdf_ugaussian_Pinv (f);
416 rank_ntiles (double c, double cc, double cc_1,
419 double r = rank_rank (c, cc, cc_1, i, w);
422 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
425 /* Expected value of the order statistics from an exponential distribution */
427 ee (int j, double w_star)
432 for (k = 1 ; k <= j; k++)
433 sum += 1.0 / ( w_star + 1 - k );
440 rank_savage (double c, double cc, double cc_1,
441 int i UNUSED, double w)
444 const int i_1 = floor (cc_1);
445 const int i_2 = floor (cc);
447 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
449 const double g_1 = cc_1 - i_1;
450 const double g_2 = cc - i_2;
452 /* The second factor is infinite, when the first is zero.
453 Therefore, evaluate the second, only when the first is non-zero */
454 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
455 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
458 return ee (i_1 + 1, w_star) - 1;
460 if ( i_1 + 1 == i_2 )
461 return ( ( expr1 + expr2 )/c ) - 1;
463 if ( i_1 + 2 <= i_2 )
467 for (j = i_1 + 2 ; j <= i_2; ++j )
468 sigma += ee (j, w_star);
469 return ( (expr1 + expr2 + sigma) / c) -1;
476 /* Rank the casefile belonging to CR, starting from the current
477 postition of CR continuing up to and including the ENDth case.
479 RS points to an array containing the rank specifications to
480 use. N_RANK_SPECS is the number of elements of RS.
483 DEST_VAR_INDEX is the index into the rank_spec destvar element
484 to be used for this ranking.
486 Prerequisites: 1. The casefile must be sorted according to CRITERION.
487 2. W is the sum of the non-missing caseweights for this
488 range of the casefile.
491 rank_cases (struct casereader *cr,
493 const struct dictionary *dict,
494 const struct sort_criterion *criterion,
495 const struct missing_values *mv,
497 const struct rank_spec *rs,
500 struct casefile *dest)
507 const int fv = criterion->fv;
508 const int width = criterion->width;
510 while (casereader_cnum (cr) < end)
512 struct casereader *lookahead;
513 const union value *this_value;
514 struct ccase this_case, lookahead_case;
519 if (!casereader_read_xfer (cr, &this_case))
522 this_value = case_data (&this_case, fv);
523 c = dict_get_case_weight (dict, &this_case, &warn);
525 lookahead = casereader_clone (cr);
527 while (casereader_cnum (lookahead) < end
528 && casereader_read_xfer (lookahead, &lookahead_case))
530 const union value *lookahead_value = case_data (&lookahead_case, fv);
531 int diff = compare_values (this_value, lookahead_value, width);
535 /* Make sure the casefile was sorted */
536 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
538 case_destroy (&lookahead_case);
542 c += dict_get_case_weight (dict, &lookahead_case, &warn);
543 case_destroy (&lookahead_case);
546 casereader_destroy (lookahead);
549 if ( !value_is_missing (mv, this_value) )
554 for (i = 0; i < n_rank_specs; ++i)
556 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
558 if ( value_is_missing (mv, this_value) )
559 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
561 case_data_rw (&this_case, dest_idx)->f =
562 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
564 casefile_append_xfer (dest, &this_case);
566 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
568 if ( !value_is_missing (mv, this_value) )
572 /* If this isn't true, then all the results will be wrong */
577 same_group (const struct ccase *a, const struct ccase *b,
578 const struct sort_criteria *crit)
582 for (i = 0; i < crit->crit_cnt - 1; i++)
584 struct sort_criterion *c = &crit->crits[i];
585 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
593 static struct casefile *
594 rank_sorted_casefile (struct casefile *cf,
595 const struct sort_criteria *crit,
596 const struct dictionary *dict,
597 const struct rank_spec *rs,
600 const struct missing_values *mv)
602 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
603 struct casereader *lookahead = casefile_get_reader (cf, NULL);
604 struct casereader *pos = casereader_clone (lookahead);
605 struct ccase group_case;
608 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
610 if (casereader_read (lookahead, &group_case))
612 struct ccase this_case;
613 const union value *this_value ;
615 this_value = case_data( &group_case, ultimate_crit->fv);
617 if ( !value_is_missing(mv, this_value) )
618 w = dict_get_case_weight (dict, &group_case, &warn);
620 while (casereader_read (lookahead, &this_case))
622 const union value *this_value =
623 case_data(&this_case, ultimate_crit->fv);
624 double c = dict_get_case_weight (dict, &this_case, &warn);
625 if (!same_group (&group_case, &this_case, crit))
627 rank_cases (pos, casereader_cnum (lookahead) - 1,
635 case_destroy (&group_case);
636 case_move (&group_case, &this_case);
638 if ( !value_is_missing (mv, this_value) )
640 case_destroy (&this_case);
642 case_destroy (&group_case);
643 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
644 rs, n_rank_specs, dest_idx, dest);
647 if (casefile_error (dest))
649 casefile_destroy (dest);
653 casefile_destroy (cf);
658 /* Transformation function to enumerate all the cases */
660 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
662 struct variable *key_var = key_var_;
664 case_data_rw(cc, key_var->fv)->f = case_num;
666 return TRNS_CONTINUE;
670 /* Create and return a new variable in which to store the ranks of SRC_VAR
671 accoring to the rank function F.
672 VNAME is the name of the variable to be created.
673 If VNAME is NULL, then a name will be automatically chosen.
675 static struct variable *
676 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
677 const struct variable *src_var,
681 struct variable *var = NULL;
682 char name[SHORT_NAME_LEN + 1];
685 var = dict_create_var(dict, vname, 0);
689 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
690 function_name[f][0], var_get_name (src_var));
692 var = dict_create_var(dict, name, 0);
699 snprintf(func_abb, 4, "%s", function_name[f]);
700 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
703 var = dict_create_var(dict, name, 0);
709 while ( NULL == var )
712 snprintf(func_abb, 3, "%s", function_name[f]);
714 snprintf(name, SHORT_NAME_LEN + 1,
715 "RNK%s%02d", func_abb, i);
717 var = dict_create_var(dict, name, 0);
724 msg(ME, _("Cannot create new rank variable. All candidates in use."));
728 var_set_both_formats (var, &dest_format[f]);
743 for (i = 0 ; i < n_rank_specs ; ++i )
745 free (rank_specs[i].destvars);
752 sort_destroy_criteria (sc);
761 cmd_rank (struct lexer *lexer, struct dataset *ds)
764 struct variable *order;
768 if ( !parse_rank (lexer, ds, &cmd, NULL) )
774 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
775 if (cmd.miss == RANK_INCLUDE )
776 value_is_missing = mv_is_value_system_missing;
778 value_is_missing = mv_is_value_missing;
781 /* Default to /RANK if no function subcommands are given */
782 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
783 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
784 cmd.sbc_percent || cmd.sbc_rank ) )
786 assert ( n_rank_specs == 0 );
788 rank_specs = xmalloc (sizeof (*rank_specs));
789 rank_specs[0].rfunc = RANK;
790 rank_specs[0].destvars =
791 xcalloc (sc->crit_cnt, sizeof (struct variable *));
796 assert ( sc->crit_cnt == n_src_vars);
798 /* Create variables for all rank destinations which haven't
799 already been created with INTO.
800 Add labels to all the destination variables.
802 for (i = 0 ; i < n_rank_specs ; ++i )
805 for ( v = 0 ; v < n_src_vars ; v ++ )
807 if ( rank_specs[i].destvars[v] == NULL )
809 rank_specs[i].destvars[v] =
810 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
813 create_var_label ( rank_specs[i].destvars[v],
815 rank_specs[i].rfunc);
819 if ( cmd.print == RANK_YES )
823 tab_output_text (0, _("Variables Created By RANK"));
824 tab_output_text (0, "\n");
826 for (i = 0 ; i < n_rank_specs ; ++i )
828 for ( v = 0 ; v < n_src_vars ; v ++ )
830 if ( n_group_vars > 0 )
832 struct string varlist;
835 ds_init_empty (&varlist);
836 for ( g = 0 ; g < n_group_vars ; ++g )
838 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
840 if ( g < n_group_vars - 1)
841 ds_put_cstr (&varlist, " ");
844 if ( rank_specs[i].rfunc == NORMAL ||
845 rank_specs[i].rfunc == PROPORTION )
846 tab_output_text (TAT_PRINTF,
847 _("%s into %s(%s of %s using %s BY %s)"),
848 var_get_name (src_vars[v]),
849 var_get_name (rank_specs[i].destvars[v]),
850 function_name[rank_specs[i].rfunc],
851 var_get_name (src_vars[v]),
857 tab_output_text (TAT_PRINTF,
858 _("%s into %s(%s of %s BY %s)"),
859 var_get_name (src_vars[v]),
860 var_get_name (rank_specs[i].destvars[v]),
861 function_name[rank_specs[i].rfunc],
862 var_get_name (src_vars[v]),
865 ds_destroy (&varlist);
869 if ( rank_specs[i].rfunc == NORMAL ||
870 rank_specs[i].rfunc == PROPORTION )
871 tab_output_text (TAT_PRINTF,
872 _("%s into %s(%s of %s using %s)"),
873 var_get_name (src_vars[v]),
874 var_get_name (rank_specs[i].destvars[v]),
875 function_name[rank_specs[i].rfunc],
876 var_get_name (src_vars[v]),
881 tab_output_text (TAT_PRINTF,
882 _("%s into %s(%s of %s)"),
883 var_get_name (src_vars[v]),
884 var_get_name (rank_specs[i].destvars[v]),
885 function_name[rank_specs[i].rfunc],
886 var_get_name (src_vars[v])
893 if ( cmd.sbc_fraction &&
894 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
895 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
897 /* Add a variable which we can sort by to get back the original
899 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
901 add_transformation (ds, create_resort_key, 0, order);
904 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
906 /* Put the active file back in its original order */
908 struct sort_criteria criteria;
909 struct sort_criterion restore_criterion ;
910 restore_criterion.fv = order->fv;
911 restore_criterion.width = 0;
912 restore_criterion.dir = SRT_ASCEND;
914 criteria.crits = &restore_criterion;
915 criteria.crit_cnt = 1;
917 sort_active_file_in_place (ds, &criteria);
920 /* ... and we don't need our sort key anymore. So delete it */
921 dict_delete_var (dataset_dict (ds), order);
925 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
929 /* Parser for the variables sub command
930 Returns 1 on success */
932 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
934 static const int terminators[2] = {T_BY, 0};
936 lex_match (lexer, '=');
938 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
939 && lex_token (lexer) != T_ALL)
942 sc = sort_parse_criteria (lexer, dataset_dict (ds),
943 &src_vars, &n_src_vars, 0, terminators);
945 if ( lex_match (lexer, T_BY) )
947 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
952 if (!parse_variables (lexer, dataset_dict (ds),
953 &group_vars, &n_group_vars,
954 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
965 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
967 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
972 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
973 rank_specs[n_rank_specs - 1].rfunc = f;
974 rank_specs[n_rank_specs - 1].destvars = NULL;
976 rank_specs[n_rank_specs - 1].destvars =
977 xcalloc (sc->crit_cnt, sizeof (struct variable *));
979 if (lex_match_id (lexer, "INTO"))
981 struct variable *destvar;
983 while( lex_token (lexer) == T_ID )
986 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
988 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
991 if ( var_count >= sc->crit_cnt )
993 msg(SE, _("Too many variables in INTO clause."));
997 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
998 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1010 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1012 struct dictionary *dict = dataset_dict (ds);
1014 return parse_rank_function (lexer, dict, cmd, RANK);
1018 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1020 struct dictionary *dict = dataset_dict (ds);
1022 return parse_rank_function (lexer, dict, cmd, NORMAL);
1026 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1028 struct dictionary *dict = dataset_dict (ds);
1030 return parse_rank_function (lexer, dict, cmd, PERCENT);
1034 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1036 struct dictionary *dict = dataset_dict (ds);
1038 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1042 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1044 struct dictionary *dict = dataset_dict (ds);
1046 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1050 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1052 struct dictionary *dict = dataset_dict (ds);
1054 return parse_rank_function (lexer, dict, cmd, N);
1058 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1060 struct dictionary *dict = dataset_dict (ds);
1062 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1067 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1069 struct dictionary *dict = dataset_dict (ds);
1071 if ( lex_force_match (lexer, '(') )
1073 if ( lex_force_int (lexer) )
1075 k_ntiles = lex_integer (lexer);
1077 lex_force_match (lexer, ')');
1085 return parse_rank_function (lexer, dict, cmd, NTILES);