1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Ben Pfaff <blp@gnu.org>.
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include "sort-criteria.h"
25 #include <data/dictionary.h>
26 #include <data/format.h>
27 #include <data/missing-values.h>
28 #include <data/procedure.h>
29 #include <data/variable.h>
30 #include <data/case.h>
31 #include <data/casefile.h>
32 #include <data/fastfile.h>
33 #include <data/storage-stream.h>
34 #include <language/command.h>
35 #include <language/stats/sort-criteria.h>
37 #include <libpspp/compiler.h>
38 #include <math/sort.h>
39 #include <output/table.h>
40 #include <output/manager.h>
42 #include <gsl/gsl_cdf.h>
46 #define _(msgid) gettext (msgid)
62 +fraction=fraction:!blom/tukey/vw/rankit;
63 +ties=ties:!mean/low/high/condense;
64 missing=miss:!exclude/include.
69 typedef double (*rank_function_t) (double c, double cc, double cc_1,
72 static double rank_proportion (double c, double cc, double cc_1,
75 static double rank_normal (double c, double cc, double cc_1,
78 static double rank_percent (double c, double cc, double cc_1,
81 static double rank_rfraction (double c, double cc, double cc_1,
84 static double rank_rank (double c, double cc, double cc_1,
87 static double rank_n (double c, double cc, double cc_1,
90 static double rank_savage (double c, double cc, double cc_1,
93 static double rank_ntiles (double c, double cc, double cc_1,
110 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
111 {FMT_F, 9, 3}, /* rank */
112 {FMT_F, 6, 4}, /* normal */
113 {FMT_F, 6, 2}, /* percent */
114 {FMT_F, 6, 4}, /* rfraction */
115 {FMT_F, 6, 4}, /* proportion */
116 {FMT_F, 6, 0}, /* n */
117 {FMT_F, 3, 0}, /* ntiles */
118 {FMT_F, 8, 4} /* savage */
121 static const char * const function_name[n_RANK_FUNCS] = {
132 static const rank_function_t rank_func[n_RANK_FUNCS] = {
146 enum RANK_FUNC rfunc;
147 struct variable **destvars;
151 /* Categories of missing values to exclude. */
152 static enum mv_class exclude_values;
154 static struct rank_spec *rank_specs;
155 static size_t n_rank_specs;
157 static struct sort_criteria *sc;
159 static struct variable **group_vars;
160 static size_t n_group_vars;
162 static struct variable **src_vars;
163 static size_t n_src_vars;
168 static struct cmd_rank cmd;
170 static struct casefile *rank_sorted_casefile (struct casefile *cf,
171 const struct sort_criteria *,
172 const struct dictionary *,
173 const struct rank_spec *rs,
176 const struct missing_values *miss
181 static char name[10];
182 switch ( cmd.fraction )
185 strcpy (name, "BLOM");
188 strcpy (name, "RANKIT");
191 strcpy (name, "TUKEY");
202 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
204 create_var_label (struct variable *dest_var,
205 const struct variable *src_var, enum RANK_FUNC f)
208 ds_init_empty (&label);
210 if ( n_group_vars > 0 )
212 struct string group_var_str;
215 ds_init_empty (&group_var_str);
217 for (g = 0 ; g < n_group_vars ; ++g )
219 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
220 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
223 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
224 var_get_name (src_var), ds_cstr (&group_var_str));
225 ds_destroy (&group_var_str);
228 ds_put_format (&label, _("%s of %s"),
229 function_name[f], var_get_name (src_var));
231 var_set_label (dest_var, ds_cstr (&label));
238 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
239 const struct rank_spec *rank_specs, int n_rank_specs)
241 struct sort_criteria criteria;
244 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
246 criteria.crit_cnt = n_splits + n_group_vars + 1;
247 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
248 for (i = 0; i < n_splits ; i++)
250 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
251 criteria.crits[i].fv = var_get_case_index (v);
252 criteria.crits[i].width = var_get_width (v);
253 criteria.crits[i].dir = SRT_ASCEND;
255 for (i = 0; i < n_group_vars; i++)
257 criteria.crits[i + n_splits].fv = var_get_case_index (group_vars[i]);
258 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
259 criteria.crits[i + n_splits].dir = SRT_ASCEND;
261 for (i = 0 ; i < sc->crit_cnt ; ++i )
263 struct casefile *out ;
264 struct casefile *cf ;
265 struct casereader *reader ;
266 struct casefile *sorted_cf ;
268 /* Obtain active file in CF. */
269 if (!procedure (ds, NULL, NULL))
272 cf = proc_capture_output (ds);
274 /* Sort CF into SORTED_CF. */
275 reader = casefile_get_destructive_reader (cf) ;
276 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
277 assert ( sc->crits[i].fv == var_get_case_index (src_vars[i]) );
278 sorted_cf = sort_execute (reader, &criteria, NULL);
279 casefile_destroy (cf);
281 out = rank_sorted_casefile (sorted_cf, &criteria,
283 rank_specs, n_rank_specs,
284 i, var_get_missing_values (src_vars[i]));
291 proc_set_source (ds, storage_source_create (out));
294 free (criteria.crits);
298 free (criteria.crits);
302 /* Hardly a rank function !! */
304 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
305 int i UNUSED, double w)
312 rank_rank (double c, double cc, double cc_1,
313 int i, double w UNUSED)
327 rank = cc_1 + (c + 1.0)/ 2.0;
347 rank = cc_1 + c / 2.0 ;
362 rank_rfraction (double c, double cc, double cc_1,
365 return rank_rank (c, cc, cc_1, i, w) / w ;
370 rank_percent (double c, double cc, double cc_1,
373 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
378 rank_proportion (double c, double cc, double cc_1,
381 const double r = rank_rank (c, cc, cc_1, i, w) ;
385 switch ( cmd.fraction )
388 f = (r - 3.0/8.0) / (w + 0.25);
394 f = (r - 1.0/3.0) / (w + 1.0/3.0);
404 return (f > 0) ? f : SYSMIS;
408 rank_normal (double c, double cc, double cc_1,
411 double f = rank_proportion (c, cc, cc_1, i, w);
413 return gsl_cdf_ugaussian_Pinv (f);
417 rank_ntiles (double c, double cc, double cc_1,
420 double r = rank_rank (c, cc, cc_1, i, w);
423 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
426 /* Expected value of the order statistics from an exponential distribution */
428 ee (int j, double w_star)
433 for (k = 1 ; k <= j; k++)
434 sum += 1.0 / ( w_star + 1 - k );
441 rank_savage (double c, double cc, double cc_1,
442 int i UNUSED, double w)
445 const int i_1 = floor (cc_1);
446 const int i_2 = floor (cc);
448 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
450 const double g_1 = cc_1 - i_1;
451 const double g_2 = cc - i_2;
453 /* The second factor is infinite, when the first is zero.
454 Therefore, evaluate the second, only when the first is non-zero */
455 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
456 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
459 return ee (i_1 + 1, w_star) - 1;
461 if ( i_1 + 1 == i_2 )
462 return ( ( expr1 + expr2 )/c ) - 1;
464 if ( i_1 + 2 <= i_2 )
468 for (j = i_1 + 2 ; j <= i_2; ++j )
469 sigma += ee (j, w_star);
470 return ( (expr1 + expr2 + sigma) / c) -1;
477 /* Rank the casefile belonging to CR, starting from the current
478 postition of CR continuing up to and including the ENDth case.
480 RS points to an array containing the rank specifications to
481 use. N_RANK_SPECS is the number of elements of RS.
484 DEST_VAR_INDEX is the index into the rank_spec destvar element
485 to be used for this ranking.
487 Prerequisites: 1. The casefile must be sorted according to CRITERION.
488 2. W is the sum of the non-missing caseweights for this
489 range of the casefile.
492 rank_cases (struct casereader *cr,
494 const struct dictionary *dict,
495 const struct sort_criterion *criterion,
496 const struct missing_values *mv,
498 const struct rank_spec *rs,
501 struct casefile *dest)
508 const int fv = criterion->fv;
509 const int width = criterion->width;
511 while (casereader_cnum (cr) < end)
513 struct casereader *lookahead;
514 const union value *this_value;
515 struct ccase this_case, lookahead_case;
520 if (!casereader_read_xfer (cr, &this_case))
523 this_value = case_data_idx (&this_case, fv);
524 c = dict_get_case_weight (dict, &this_case, &warn);
526 lookahead = casereader_clone (cr);
528 while (casereader_cnum (lookahead) < end
529 && casereader_read_xfer (lookahead, &lookahead_case))
531 const union value *lookahead_value = case_data_idx (&lookahead_case, fv);
532 int diff = compare_values (this_value, lookahead_value, width);
536 /* Make sure the casefile was sorted */
537 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
539 case_destroy (&lookahead_case);
543 c += dict_get_case_weight (dict, &lookahead_case, &warn);
544 case_destroy (&lookahead_case);
547 casereader_destroy (lookahead);
550 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
555 for (i = 0; i < n_rank_specs; ++i)
557 const struct variable *dst_var = rs[i].destvars[dest_var_index];
559 if ( mv_is_value_missing (mv, this_value, exclude_values) )
560 case_data_rw (&this_case, dst_var)->f = SYSMIS;
562 case_data_rw (&this_case, dst_var)->f =
563 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
565 casefile_append_xfer (dest, &this_case);
567 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
569 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
573 /* If this isn't true, then all the results will be wrong */
578 same_group (const struct ccase *a, const struct ccase *b,
579 const struct sort_criteria *crit)
583 for (i = 0; i < crit->crit_cnt - 1; i++)
585 struct sort_criterion *c = &crit->crits[i];
586 if (compare_values (case_data_idx (a, c->fv),
587 case_data_idx (b, c->fv), c->width) != 0)
594 static struct casefile *
595 rank_sorted_casefile (struct casefile *cf,
596 const struct sort_criteria *crit,
597 const struct dictionary *dict,
598 const struct rank_spec *rs,
601 const struct missing_values *mv)
603 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
604 struct casereader *lookahead = casefile_get_reader (cf, NULL);
605 struct casereader *pos = casereader_clone (lookahead);
606 struct ccase group_case;
609 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
611 if (casereader_read (lookahead, &group_case))
613 struct ccase this_case;
614 const union value *this_value ;
616 this_value = case_data_idx( &group_case, ultimate_crit->fv);
618 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
619 w = dict_get_case_weight (dict, &group_case, &warn);
621 while (casereader_read (lookahead, &this_case))
623 const union value *this_value =
624 case_data_idx(&this_case, ultimate_crit->fv);
625 double c = dict_get_case_weight (dict, &this_case, &warn);
626 if (!same_group (&group_case, &this_case, crit))
628 rank_cases (pos, casereader_cnum (lookahead) - 1,
636 case_destroy (&group_case);
637 case_move (&group_case, &this_case);
639 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
641 case_destroy (&this_case);
643 case_destroy (&group_case);
644 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
645 rs, n_rank_specs, dest_idx, dest);
648 if (casefile_error (dest))
650 casefile_destroy (dest);
654 casefile_destroy (cf);
659 /* Transformation function to enumerate all the cases */
661 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
663 struct variable *key_var = key_var_;
665 case_data_rw(cc, key_var)->f = case_num;
667 return TRNS_CONTINUE;
671 /* Create and return a new variable in which to store the ranks of SRC_VAR
672 accoring to the rank function F.
673 VNAME is the name of the variable to be created.
674 If VNAME is NULL, then a name will be automatically chosen.
676 static struct variable *
677 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
678 const struct variable *src_var,
682 struct variable *var = NULL;
683 char name[SHORT_NAME_LEN + 1];
686 var = dict_create_var(dict, vname, 0);
690 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
691 function_name[f][0], var_get_name (src_var));
693 var = dict_create_var(dict, name, 0);
700 snprintf(func_abb, 4, "%s", function_name[f]);
701 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
704 var = dict_create_var(dict, name, 0);
710 while ( NULL == var )
713 snprintf(func_abb, 3, "%s", function_name[f]);
715 snprintf(name, SHORT_NAME_LEN + 1,
716 "RNK%s%02d", func_abb, i);
718 var = dict_create_var(dict, name, 0);
725 msg(ME, _("Cannot create new rank variable. All candidates in use."));
729 var_set_both_formats (var, &dest_format[f]);
744 for (i = 0 ; i < n_rank_specs ; ++i )
746 free (rank_specs[i].destvars);
753 sort_destroy_criteria (sc);
762 cmd_rank (struct lexer *lexer, struct dataset *ds)
765 struct variable *order;
769 if ( !parse_rank (lexer, ds, &cmd, NULL) )
775 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
776 exclude_values = cmd.miss == RANK_INCLUDE ? MV_SYSTEM : MV_ANY;
778 /* Default to /RANK if no function subcommands are given */
779 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
780 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
781 cmd.sbc_percent || cmd.sbc_rank ) )
783 assert ( n_rank_specs == 0 );
785 rank_specs = xmalloc (sizeof (*rank_specs));
786 rank_specs[0].rfunc = RANK;
787 rank_specs[0].destvars =
788 xcalloc (sc->crit_cnt, sizeof (struct variable *));
793 assert ( sc->crit_cnt == n_src_vars);
795 /* Create variables for all rank destinations which haven't
796 already been created with INTO.
797 Add labels to all the destination variables.
799 for (i = 0 ; i < n_rank_specs ; ++i )
802 for ( v = 0 ; v < n_src_vars ; v ++ )
804 if ( rank_specs[i].destvars[v] == NULL )
806 rank_specs[i].destvars[v] =
807 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
810 create_var_label ( rank_specs[i].destvars[v],
812 rank_specs[i].rfunc);
816 if ( cmd.print == RANK_YES )
820 tab_output_text (0, _("Variables Created By RANK"));
821 tab_output_text (0, "\n");
823 for (i = 0 ; i < n_rank_specs ; ++i )
825 for ( v = 0 ; v < n_src_vars ; v ++ )
827 if ( n_group_vars > 0 )
829 struct string varlist;
832 ds_init_empty (&varlist);
833 for ( g = 0 ; g < n_group_vars ; ++g )
835 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
837 if ( g < n_group_vars - 1)
838 ds_put_cstr (&varlist, " ");
841 if ( rank_specs[i].rfunc == NORMAL ||
842 rank_specs[i].rfunc == PROPORTION )
843 tab_output_text (TAT_PRINTF,
844 _("%s into %s(%s of %s using %s BY %s)"),
845 var_get_name (src_vars[v]),
846 var_get_name (rank_specs[i].destvars[v]),
847 function_name[rank_specs[i].rfunc],
848 var_get_name (src_vars[v]),
854 tab_output_text (TAT_PRINTF,
855 _("%s into %s(%s of %s BY %s)"),
856 var_get_name (src_vars[v]),
857 var_get_name (rank_specs[i].destvars[v]),
858 function_name[rank_specs[i].rfunc],
859 var_get_name (src_vars[v]),
862 ds_destroy (&varlist);
866 if ( rank_specs[i].rfunc == NORMAL ||
867 rank_specs[i].rfunc == PROPORTION )
868 tab_output_text (TAT_PRINTF,
869 _("%s into %s(%s of %s using %s)"),
870 var_get_name (src_vars[v]),
871 var_get_name (rank_specs[i].destvars[v]),
872 function_name[rank_specs[i].rfunc],
873 var_get_name (src_vars[v]),
878 tab_output_text (TAT_PRINTF,
879 _("%s into %s(%s of %s)"),
880 var_get_name (src_vars[v]),
881 var_get_name (rank_specs[i].destvars[v]),
882 function_name[rank_specs[i].rfunc],
883 var_get_name (src_vars[v])
890 if ( cmd.sbc_fraction &&
891 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
892 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
894 /* Add a variable which we can sort by to get back the original
896 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
898 add_transformation (ds, create_resort_key, 0, order);
901 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
903 /* Put the active file back in its original order */
905 struct sort_criteria criteria;
906 struct sort_criterion restore_criterion ;
907 restore_criterion.fv = var_get_case_index (order);
908 restore_criterion.width = 0;
909 restore_criterion.dir = SRT_ASCEND;
911 criteria.crits = &restore_criterion;
912 criteria.crit_cnt = 1;
914 sort_active_file_in_place (ds, &criteria);
917 /* ... and we don't need our sort key anymore. So delete it */
918 dict_delete_var (dataset_dict (ds), order);
922 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
926 /* Parser for the variables sub command
927 Returns 1 on success */
929 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
931 static const int terminators[2] = {T_BY, 0};
933 lex_match (lexer, '=');
935 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
936 && lex_token (lexer) != T_ALL)
939 sc = sort_parse_criteria (lexer, dataset_dict (ds),
940 &src_vars, &n_src_vars, 0, terminators);
942 if ( lex_match (lexer, T_BY) )
944 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
949 if (!parse_variables (lexer, dataset_dict (ds),
950 &group_vars, &n_group_vars,
951 PV_NO_DUPLICATE | PV_NO_SCRATCH) )
962 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
964 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
969 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
970 rank_specs[n_rank_specs - 1].rfunc = f;
971 rank_specs[n_rank_specs - 1].destvars = NULL;
973 rank_specs[n_rank_specs - 1].destvars =
974 xcalloc (sc->crit_cnt, sizeof (struct variable *));
976 if (lex_match_id (lexer, "INTO"))
978 struct variable *destvar;
980 while( lex_token (lexer) == T_ID )
983 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
985 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
988 if ( var_count >= sc->crit_cnt )
990 msg(SE, _("Too many variables in INTO clause."));
994 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
995 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1007 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1009 struct dictionary *dict = dataset_dict (ds);
1011 return parse_rank_function (lexer, dict, cmd, RANK);
1015 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1017 struct dictionary *dict = dataset_dict (ds);
1019 return parse_rank_function (lexer, dict, cmd, NORMAL);
1023 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1025 struct dictionary *dict = dataset_dict (ds);
1027 return parse_rank_function (lexer, dict, cmd, PERCENT);
1031 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1033 struct dictionary *dict = dataset_dict (ds);
1035 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1039 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1041 struct dictionary *dict = dataset_dict (ds);
1043 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1047 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1049 struct dictionary *dict = dataset_dict (ds);
1051 return parse_rank_function (lexer, dict, cmd, N);
1055 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1057 struct dictionary *dict = dataset_dict (ds);
1059 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1064 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1066 struct dictionary *dict = dataset_dict (ds);
1068 if ( lex_force_match (lexer, '(') )
1070 if ( lex_force_int (lexer) )
1072 k_ntiles = lex_integer (lexer);
1074 lex_force_match (lexer, ')');
1082 return parse_rank_function (lexer, dict, cmd, NTILES);