1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Ben Pfaff <blp@gnu.org>.
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include "sort-criteria.h"
25 #include <data/dictionary.h>
26 #include <data/format.h>
27 #include <data/missing-values.h>
28 #include <data/procedure.h>
29 #include <data/variable.h>
30 #include <data/case.h>
31 #include <data/casefile.h>
32 #include <data/fastfile.h>
33 #include <data/storage-stream.h>
34 #include <language/command.h>
35 #include <language/stats/sort-criteria.h>
37 #include <libpspp/compiler.h>
38 #include <math/sort.h>
39 #include <output/table.h>
40 #include <output/manager.h>
42 #include <gsl/gsl_cdf.h>
46 #define _(msgid) gettext (msgid)
62 +fraction=fraction:!blom/tukey/vw/rankit;
63 +ties=ties:!mean/low/high/condense;
64 missing=miss:!exclude/include.
69 typedef double (*rank_function_t) (double c, double cc, double cc_1,
72 static double rank_proportion (double c, double cc, double cc_1,
75 static double rank_normal (double c, double cc, double cc_1,
78 static double rank_percent (double c, double cc, double cc_1,
81 static double rank_rfraction (double c, double cc, double cc_1,
84 static double rank_rank (double c, double cc, double cc_1,
87 static double rank_n (double c, double cc, double cc_1,
90 static double rank_savage (double c, double cc, double cc_1,
93 static double rank_ntiles (double c, double cc, double cc_1,
110 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
111 {FMT_F, 9, 3}, /* rank */
112 {FMT_F, 6, 4}, /* normal */
113 {FMT_F, 6, 2}, /* percent */
114 {FMT_F, 6, 4}, /* rfraction */
115 {FMT_F, 6, 4}, /* proportion */
116 {FMT_F, 6, 0}, /* n */
117 {FMT_F, 3, 0}, /* ntiles */
118 {FMT_F, 8, 4} /* savage */
121 static const char * const function_name[n_RANK_FUNCS] = {
132 static const rank_function_t rank_func[n_RANK_FUNCS] = {
146 enum RANK_FUNC rfunc;
147 struct variable **destvars;
151 /* Function to use for testing for missing values */
152 static mv_is_missing_func *value_is_missing;
154 static struct rank_spec *rank_specs;
155 static size_t n_rank_specs;
157 static struct sort_criteria *sc;
159 static struct variable **group_vars;
160 static size_t n_group_vars;
162 static struct variable **src_vars;
163 static size_t n_src_vars;
168 static struct cmd_rank cmd;
170 static struct casefile *rank_sorted_casefile (struct casefile *cf,
171 const struct sort_criteria *,
172 const struct dictionary *,
173 const struct rank_spec *rs,
176 const struct missing_values *miss
181 static char name[10];
182 switch ( cmd.fraction )
185 strcpy (name, "BLOM");
188 strcpy (name, "RANKIT");
191 strcpy (name, "TUKEY");
202 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
204 create_var_label (struct variable *dest_var,
205 const struct variable *src_var, enum RANK_FUNC f)
208 ds_init_empty (&label);
210 if ( n_group_vars > 0 )
212 struct string group_var_str;
215 ds_init_empty (&group_var_str);
217 for (g = 0 ; g < n_group_vars ; ++g )
219 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
220 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
223 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
224 var_get_name (src_var), ds_cstr (&group_var_str));
225 ds_destroy (&group_var_str);
228 ds_put_format (&label, _("%s of %s"),
229 function_name[f], var_get_name (src_var));
231 var_set_label (dest_var, ds_cstr (&label));
238 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
239 const struct rank_spec *rank_specs, int n_rank_specs)
241 struct sort_criteria criteria;
244 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
246 criteria.crit_cnt = n_splits + n_group_vars + 1;
247 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
248 for (i = 0; i < n_splits ; i++)
250 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
251 criteria.crits[i].fv = var_get_case_index (v);
252 criteria.crits[i].width = var_get_width (v);
253 criteria.crits[i].dir = SRT_ASCEND;
255 for (i = 0; i < n_group_vars; i++)
257 criteria.crits[i + n_splits].fv = var_get_case_index (group_vars[i]);
258 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
259 criteria.crits[i + n_splits].dir = SRT_ASCEND;
261 for (i = 0 ; i < sc->crit_cnt ; ++i )
263 struct casefile *out ;
264 struct casefile *cf ;
265 struct casereader *reader ;
266 struct casefile *sorted_cf ;
268 /* Obtain active file in CF. */
269 if (!procedure (ds, NULL, NULL))
272 cf = proc_capture_output (ds);
274 /* Sort CF into SORTED_CF. */
275 reader = casefile_get_destructive_reader (cf) ;
276 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
277 assert ( sc->crits[i].fv == var_get_case_index (src_vars[i]) );
278 sorted_cf = sort_execute (reader, &criteria);
279 casefile_destroy (cf);
281 out = rank_sorted_casefile (sorted_cf, &criteria,
283 rank_specs, n_rank_specs,
284 i, var_get_missing_values (src_vars[i]));
291 proc_set_source (ds, storage_source_create (out));
294 free (criteria.crits);
298 free (criteria.crits);
302 /* Hardly a rank function !! */
304 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
305 int i UNUSED, double w)
312 rank_rank (double c, double cc, double cc_1,
313 int i, double w UNUSED)
327 rank = cc_1 + (c + 1.0)/ 2.0;
347 rank = cc_1 + c / 2.0 ;
362 rank_rfraction (double c, double cc, double cc_1,
365 return rank_rank (c, cc, cc_1, i, w) / w ;
370 rank_percent (double c, double cc, double cc_1,
373 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
378 rank_proportion (double c, double cc, double cc_1,
381 const double r = rank_rank (c, cc, cc_1, i, w) ;
385 switch ( cmd.fraction )
388 f = (r - 3.0/8.0) / (w + 0.25);
394 f = (r - 1.0/3.0) / (w + 1.0/3.0);
404 return (f > 0) ? f : SYSMIS;
408 rank_normal (double c, double cc, double cc_1,
411 double f = rank_proportion (c, cc, cc_1, i, w);
413 return gsl_cdf_ugaussian_Pinv (f);
417 rank_ntiles (double c, double cc, double cc_1,
420 double r = rank_rank (c, cc, cc_1, i, w);
423 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
426 /* Expected value of the order statistics from an exponential distribution */
428 ee (int j, double w_star)
433 for (k = 1 ; k <= j; k++)
434 sum += 1.0 / ( w_star + 1 - k );
441 rank_savage (double c, double cc, double cc_1,
442 int i UNUSED, double w)
445 const int i_1 = floor (cc_1);
446 const int i_2 = floor (cc);
448 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
450 const double g_1 = cc_1 - i_1;
451 const double g_2 = cc - i_2;
453 /* The second factor is infinite, when the first is zero.
454 Therefore, evaluate the second, only when the first is non-zero */
455 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
456 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
459 return ee (i_1 + 1, w_star) - 1;
461 if ( i_1 + 1 == i_2 )
462 return ( ( expr1 + expr2 )/c ) - 1;
464 if ( i_1 + 2 <= i_2 )
468 for (j = i_1 + 2 ; j <= i_2; ++j )
469 sigma += ee (j, w_star);
470 return ( (expr1 + expr2 + sigma) / c) -1;
477 /* Rank the casefile belonging to CR, starting from the current
478 postition of CR continuing up to and including the ENDth case.
480 RS points to an array containing the rank specifications to
481 use. N_RANK_SPECS is the number of elements of RS.
484 DEST_VAR_INDEX is the index into the rank_spec destvar element
485 to be used for this ranking.
487 Prerequisites: 1. The casefile must be sorted according to CRITERION.
488 2. W is the sum of the non-missing caseweights for this
489 range of the casefile.
492 rank_cases (struct casereader *cr,
494 const struct dictionary *dict,
495 const struct sort_criterion *criterion,
496 const struct missing_values *mv,
498 const struct rank_spec *rs,
501 struct casefile *dest)
508 const int fv = criterion->fv;
509 const int width = criterion->width;
511 while (casereader_cnum (cr) < end)
513 struct casereader *lookahead;
514 const union value *this_value;
515 struct ccase this_case, lookahead_case;
520 if (!casereader_read_xfer (cr, &this_case))
523 this_value = case_data_idx (&this_case, fv);
524 c = dict_get_case_weight (dict, &this_case, &warn);
526 lookahead = casereader_clone (cr);
528 while (casereader_cnum (lookahead) < end
529 && casereader_read_xfer (lookahead, &lookahead_case))
531 const union value *lookahead_value = case_data_idx (&lookahead_case, fv);
532 int diff = compare_values (this_value, lookahead_value, width);
536 /* Make sure the casefile was sorted */
537 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
539 case_destroy (&lookahead_case);
543 c += dict_get_case_weight (dict, &lookahead_case, &warn);
544 case_destroy (&lookahead_case);
547 casereader_destroy (lookahead);
550 if ( !value_is_missing (mv, this_value) )
555 for (i = 0; i < n_rank_specs; ++i)
557 const struct variable *dst_var = rs[i].destvars[dest_var_index];
559 if ( value_is_missing (mv, this_value) )
560 case_data_rw (&this_case, dst_var)->f = SYSMIS;
562 case_data_rw (&this_case, dst_var)->f =
563 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
565 casefile_append_xfer (dest, &this_case);
567 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
569 if ( !value_is_missing (mv, this_value) )
573 /* If this isn't true, then all the results will be wrong */
578 same_group (const struct ccase *a, const struct ccase *b,
579 const struct sort_criteria *crit)
583 for (i = 0; i < crit->crit_cnt - 1; i++)
585 struct sort_criterion *c = &crit->crits[i];
586 if (compare_values (case_data_idx (a, c->fv),
587 case_data_idx (b, c->fv), c->width) != 0)
594 static struct casefile *
595 rank_sorted_casefile (struct casefile *cf,
596 const struct sort_criteria *crit,
597 const struct dictionary *dict,
598 const struct rank_spec *rs,
601 const struct missing_values *mv)
603 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
604 struct casereader *lookahead = casefile_get_reader (cf, NULL);
605 struct casereader *pos = casereader_clone (lookahead);
606 struct ccase group_case;
609 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
611 if (casereader_read (lookahead, &group_case))
613 struct ccase this_case;
614 const union value *this_value ;
616 this_value = case_data_idx( &group_case, ultimate_crit->fv);
618 if ( !value_is_missing(mv, this_value) )
619 w = dict_get_case_weight (dict, &group_case, &warn);
621 while (casereader_read (lookahead, &this_case))
623 const union value *this_value =
624 case_data_idx(&this_case, ultimate_crit->fv);
625 double c = dict_get_case_weight (dict, &this_case, &warn);
626 if (!same_group (&group_case, &this_case, crit))
628 rank_cases (pos, casereader_cnum (lookahead) - 1,
636 case_destroy (&group_case);
637 case_move (&group_case, &this_case);
639 if ( !value_is_missing (mv, this_value) )
641 case_destroy (&this_case);
643 case_destroy (&group_case);
644 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
645 rs, n_rank_specs, dest_idx, dest);
648 if (casefile_error (dest))
650 casefile_destroy (dest);
654 casefile_destroy (cf);
659 /* Transformation function to enumerate all the cases */
661 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
663 struct variable *key_var = key_var_;
665 case_data_rw(cc, key_var)->f = case_num;
667 return TRNS_CONTINUE;
671 /* Create and return a new variable in which to store the ranks of SRC_VAR
672 accoring to the rank function F.
673 VNAME is the name of the variable to be created.
674 If VNAME is NULL, then a name will be automatically chosen.
676 static struct variable *
677 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
678 const struct variable *src_var,
682 struct variable *var = NULL;
683 char name[SHORT_NAME_LEN + 1];
686 var = dict_create_var(dict, vname, 0);
690 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
691 function_name[f][0], var_get_name (src_var));
693 var = dict_create_var(dict, name, 0);
700 snprintf(func_abb, 4, "%s", function_name[f]);
701 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
704 var = dict_create_var(dict, name, 0);
710 while ( NULL == var )
713 snprintf(func_abb, 3, "%s", function_name[f]);
715 snprintf(name, SHORT_NAME_LEN + 1,
716 "RNK%s%02d", func_abb, i);
718 var = dict_create_var(dict, name, 0);
725 msg(ME, _("Cannot create new rank variable. All candidates in use."));
729 var_set_both_formats (var, &dest_format[f]);
744 for (i = 0 ; i < n_rank_specs ; ++i )
746 free (rank_specs[i].destvars);
753 sort_destroy_criteria (sc);
762 cmd_rank (struct lexer *lexer, struct dataset *ds)
765 struct variable *order;
769 if ( !parse_rank (lexer, ds, &cmd, NULL) )
775 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
776 if (cmd.miss == RANK_INCLUDE )
777 value_is_missing = mv_is_value_system_missing;
779 value_is_missing = mv_is_value_missing;
782 /* Default to /RANK if no function subcommands are given */
783 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
784 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
785 cmd.sbc_percent || cmd.sbc_rank ) )
787 assert ( n_rank_specs == 0 );
789 rank_specs = xmalloc (sizeof (*rank_specs));
790 rank_specs[0].rfunc = RANK;
791 rank_specs[0].destvars =
792 xcalloc (sc->crit_cnt, sizeof (struct variable *));
797 assert ( sc->crit_cnt == n_src_vars);
799 /* Create variables for all rank destinations which haven't
800 already been created with INTO.
801 Add labels to all the destination variables.
803 for (i = 0 ; i < n_rank_specs ; ++i )
806 for ( v = 0 ; v < n_src_vars ; v ++ )
808 if ( rank_specs[i].destvars[v] == NULL )
810 rank_specs[i].destvars[v] =
811 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
814 create_var_label ( rank_specs[i].destvars[v],
816 rank_specs[i].rfunc);
820 if ( cmd.print == RANK_YES )
824 tab_output_text (0, _("Variables Created By RANK"));
825 tab_output_text (0, "\n");
827 for (i = 0 ; i < n_rank_specs ; ++i )
829 for ( v = 0 ; v < n_src_vars ; v ++ )
831 if ( n_group_vars > 0 )
833 struct string varlist;
836 ds_init_empty (&varlist);
837 for ( g = 0 ; g < n_group_vars ; ++g )
839 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
841 if ( g < n_group_vars - 1)
842 ds_put_cstr (&varlist, " ");
845 if ( rank_specs[i].rfunc == NORMAL ||
846 rank_specs[i].rfunc == PROPORTION )
847 tab_output_text (TAT_PRINTF,
848 _("%s into %s(%s of %s using %s BY %s)"),
849 var_get_name (src_vars[v]),
850 var_get_name (rank_specs[i].destvars[v]),
851 function_name[rank_specs[i].rfunc],
852 var_get_name (src_vars[v]),
858 tab_output_text (TAT_PRINTF,
859 _("%s into %s(%s of %s BY %s)"),
860 var_get_name (src_vars[v]),
861 var_get_name (rank_specs[i].destvars[v]),
862 function_name[rank_specs[i].rfunc],
863 var_get_name (src_vars[v]),
866 ds_destroy (&varlist);
870 if ( rank_specs[i].rfunc == NORMAL ||
871 rank_specs[i].rfunc == PROPORTION )
872 tab_output_text (TAT_PRINTF,
873 _("%s into %s(%s of %s using %s)"),
874 var_get_name (src_vars[v]),
875 var_get_name (rank_specs[i].destvars[v]),
876 function_name[rank_specs[i].rfunc],
877 var_get_name (src_vars[v]),
882 tab_output_text (TAT_PRINTF,
883 _("%s into %s(%s of %s)"),
884 var_get_name (src_vars[v]),
885 var_get_name (rank_specs[i].destvars[v]),
886 function_name[rank_specs[i].rfunc],
887 var_get_name (src_vars[v])
894 if ( cmd.sbc_fraction &&
895 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
896 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
898 /* Add a variable which we can sort by to get back the original
900 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
902 add_transformation (ds, create_resort_key, 0, order);
905 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
907 /* Put the active file back in its original order */
909 struct sort_criteria criteria;
910 struct sort_criterion restore_criterion ;
911 restore_criterion.fv = var_get_case_index (order);
912 restore_criterion.width = 0;
913 restore_criterion.dir = SRT_ASCEND;
915 criteria.crits = &restore_criterion;
916 criteria.crit_cnt = 1;
918 sort_active_file_in_place (ds, &criteria);
921 /* ... and we don't need our sort key anymore. So delete it */
922 dict_delete_var (dataset_dict (ds), order);
926 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
930 /* Parser for the variables sub command
931 Returns 1 on success */
933 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
935 static const int terminators[2] = {T_BY, 0};
937 lex_match (lexer, '=');
939 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
940 && lex_token (lexer) != T_ALL)
943 sc = sort_parse_criteria (lexer, dataset_dict (ds),
944 &src_vars, &n_src_vars, 0, terminators);
946 if ( lex_match (lexer, T_BY) )
948 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
953 if (!parse_variables (lexer, dataset_dict (ds),
954 &group_vars, &n_group_vars,
955 PV_NO_DUPLICATE | PV_NO_SCRATCH) )
966 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
968 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
973 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
974 rank_specs[n_rank_specs - 1].rfunc = f;
975 rank_specs[n_rank_specs - 1].destvars = NULL;
977 rank_specs[n_rank_specs - 1].destvars =
978 xcalloc (sc->crit_cnt, sizeof (struct variable *));
980 if (lex_match_id (lexer, "INTO"))
982 struct variable *destvar;
984 while( lex_token (lexer) == T_ID )
987 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
989 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
992 if ( var_count >= sc->crit_cnt )
994 msg(SE, _("Too many variables in INTO clause."));
998 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
999 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1011 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1013 struct dictionary *dict = dataset_dict (ds);
1015 return parse_rank_function (lexer, dict, cmd, RANK);
1019 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1021 struct dictionary *dict = dataset_dict (ds);
1023 return parse_rank_function (lexer, dict, cmd, NORMAL);
1027 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1029 struct dictionary *dict = dataset_dict (ds);
1031 return parse_rank_function (lexer, dict, cmd, PERCENT);
1035 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1037 struct dictionary *dict = dataset_dict (ds);
1039 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1043 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1045 struct dictionary *dict = dataset_dict (ds);
1047 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1051 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1053 struct dictionary *dict = dataset_dict (ds);
1055 return parse_rank_function (lexer, dict, cmd, N);
1059 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1061 struct dictionary *dict = dataset_dict (ds);
1063 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1068 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1070 struct dictionary *dict = dataset_dict (ds);
1072 if ( lex_force_match (lexer, '(') )
1074 if ( lex_force_int (lexer) )
1076 k_ntiles = lex_integer (lexer);
1078 lex_force_match (lexer, ')');
1086 return parse_rank_function (lexer, dict, cmd, NTILES);