1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/missing-values.h>
29 #include <data/procedure.h>
30 #include <data/variable.h>
31 #include <data/case.h>
32 #include <data/casefile.h>
33 #include <data/fastfile.h>
34 #include <data/storage-stream.h>
35 #include <language/command.h>
36 #include <language/stats/sort-criteria.h>
38 #include <libpspp/compiler.h>
39 #include <math/sort.h>
40 #include <output/table.h>
41 #include <output/manager.h>
43 #include <gsl/gsl_cdf.h>
47 #define _(msgid) gettext (msgid)
63 +fraction=fraction:!blom/tukey/vw/rankit;
64 +ties=ties:!mean/low/high/condense;
65 missing=miss:!exclude/include.
70 typedef double (*rank_function_t) (double c, double cc, double cc_1,
73 static double rank_proportion (double c, double cc, double cc_1,
76 static double rank_normal (double c, double cc, double cc_1,
79 static double rank_percent (double c, double cc, double cc_1,
82 static double rank_rfraction (double c, double cc, double cc_1,
85 static double rank_rank (double c, double cc, double cc_1,
88 static double rank_n (double c, double cc, double cc_1,
91 static double rank_savage (double c, double cc, double cc_1,
94 static double rank_ntiles (double c, double cc, double cc_1,
111 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
112 {FMT_F, 9, 3}, /* rank */
113 {FMT_F, 6, 4}, /* normal */
114 {FMT_F, 6, 2}, /* percent */
115 {FMT_F, 6, 4}, /* rfraction */
116 {FMT_F, 6, 4}, /* proportion */
117 {FMT_F, 6, 0}, /* n */
118 {FMT_F, 3, 0}, /* ntiles */
119 {FMT_F, 8, 4} /* savage */
122 static const char * const function_name[n_RANK_FUNCS] = {
133 static const rank_function_t rank_func[n_RANK_FUNCS] = {
147 enum RANK_FUNC rfunc;
148 struct variable **destvars;
152 /* Function to use for testing for missing values */
153 static mv_is_missing_func *value_is_missing;
155 static struct rank_spec *rank_specs;
156 static size_t n_rank_specs;
158 static struct sort_criteria *sc;
160 static struct variable **group_vars;
161 static size_t n_group_vars;
163 static struct variable **src_vars;
164 static size_t n_src_vars;
169 static struct cmd_rank cmd;
171 static struct casefile *rank_sorted_casefile (struct casefile *cf,
172 const struct sort_criteria *,
173 const struct dictionary *,
174 const struct rank_spec *rs,
177 const struct missing_values *miss
182 static char name[10];
183 switch ( cmd.fraction )
186 strcpy (name, "BLOM");
189 strcpy (name, "RANKIT");
192 strcpy (name, "TUKEY");
203 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
205 create_var_label (struct variable *dest_var,
206 const struct variable *src_var, enum RANK_FUNC f)
209 ds_init_empty (&label);
211 if ( n_group_vars > 0 )
213 struct string group_var_str;
216 ds_init_empty (&group_var_str);
218 for (g = 0 ; g < n_group_vars ; ++g )
220 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
221 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
224 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
225 var_get_name (src_var), ds_cstr (&group_var_str));
226 ds_destroy (&group_var_str);
229 ds_put_format (&label, _("%s of %s"),
230 function_name[f], var_get_name (src_var));
232 var_set_label (dest_var, ds_cstr (&label));
239 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
240 const struct rank_spec *rank_specs, int n_rank_specs)
242 struct sort_criteria criteria;
245 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
247 criteria.crit_cnt = n_splits + n_group_vars + 1;
248 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
249 for (i = 0; i < n_splits ; i++)
251 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
252 criteria.crits[i].fv = var_get_case_index (v);
253 criteria.crits[i].width = var_get_width (v);
254 criteria.crits[i].dir = SRT_ASCEND;
256 for (i = 0; i < n_group_vars; i++)
258 criteria.crits[i + n_splits].fv = var_get_case_index (group_vars[i]);
259 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
260 criteria.crits[i + n_splits].dir = SRT_ASCEND;
262 for (i = 0 ; i < sc->crit_cnt ; ++i )
264 struct casefile *out ;
265 struct casefile *cf ;
266 struct casereader *reader ;
267 struct casefile *sorted_cf ;
269 /* Obtain active file in CF. */
270 if (!procedure (ds, NULL, NULL))
273 cf = proc_capture_output (ds);
275 /* Sort CF into SORTED_CF. */
276 reader = casefile_get_destructive_reader (cf) ;
277 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
278 assert ( sc->crits[i].fv == var_get_case_index (src_vars[i]) );
279 sorted_cf = sort_execute (reader, &criteria);
280 casefile_destroy (cf);
282 out = rank_sorted_casefile (sorted_cf, &criteria,
284 rank_specs, n_rank_specs,
285 i, var_get_missing_values (src_vars[i]));
292 proc_set_source (ds, storage_source_create (out));
295 free (criteria.crits);
299 free (criteria.crits);
303 /* Hardly a rank function !! */
305 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
306 int i UNUSED, double w)
313 rank_rank (double c, double cc, double cc_1,
314 int i, double w UNUSED)
328 rank = cc_1 + (c + 1.0)/ 2.0;
348 rank = cc_1 + c / 2.0 ;
363 rank_rfraction (double c, double cc, double cc_1,
366 return rank_rank (c, cc, cc_1, i, w) / w ;
371 rank_percent (double c, double cc, double cc_1,
374 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
379 rank_proportion (double c, double cc, double cc_1,
382 const double r = rank_rank (c, cc, cc_1, i, w) ;
386 switch ( cmd.fraction )
389 f = (r - 3.0/8.0) / (w + 0.25);
395 f = (r - 1.0/3.0) / (w + 1.0/3.0);
405 return (f > 0) ? f : SYSMIS;
409 rank_normal (double c, double cc, double cc_1,
412 double f = rank_proportion (c, cc, cc_1, i, w);
414 return gsl_cdf_ugaussian_Pinv (f);
418 rank_ntiles (double c, double cc, double cc_1,
421 double r = rank_rank (c, cc, cc_1, i, w);
424 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
427 /* Expected value of the order statistics from an exponential distribution */
429 ee (int j, double w_star)
434 for (k = 1 ; k <= j; k++)
435 sum += 1.0 / ( w_star + 1 - k );
442 rank_savage (double c, double cc, double cc_1,
443 int i UNUSED, double w)
446 const int i_1 = floor (cc_1);
447 const int i_2 = floor (cc);
449 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
451 const double g_1 = cc_1 - i_1;
452 const double g_2 = cc - i_2;
454 /* The second factor is infinite, when the first is zero.
455 Therefore, evaluate the second, only when the first is non-zero */
456 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
457 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
460 return ee (i_1 + 1, w_star) - 1;
462 if ( i_1 + 1 == i_2 )
463 return ( ( expr1 + expr2 )/c ) - 1;
465 if ( i_1 + 2 <= i_2 )
469 for (j = i_1 + 2 ; j <= i_2; ++j )
470 sigma += ee (j, w_star);
471 return ( (expr1 + expr2 + sigma) / c) -1;
478 /* Rank the casefile belonging to CR, starting from the current
479 postition of CR continuing up to and including the ENDth case.
481 RS points to an array containing the rank specifications to
482 use. N_RANK_SPECS is the number of elements of RS.
485 DEST_VAR_INDEX is the index into the rank_spec destvar element
486 to be used for this ranking.
488 Prerequisites: 1. The casefile must be sorted according to CRITERION.
489 2. W is the sum of the non-missing caseweights for this
490 range of the casefile.
493 rank_cases (struct casereader *cr,
495 const struct dictionary *dict,
496 const struct sort_criterion *criterion,
497 const struct missing_values *mv,
499 const struct rank_spec *rs,
502 struct casefile *dest)
509 const int fv = criterion->fv;
510 const int width = criterion->width;
512 while (casereader_cnum (cr) < end)
514 struct casereader *lookahead;
515 const union value *this_value;
516 struct ccase this_case, lookahead_case;
521 if (!casereader_read_xfer (cr, &this_case))
524 this_value = case_data_idx (&this_case, fv);
525 c = dict_get_case_weight (dict, &this_case, &warn);
527 lookahead = casereader_clone (cr);
529 while (casereader_cnum (lookahead) < end
530 && casereader_read_xfer (lookahead, &lookahead_case))
532 const union value *lookahead_value = case_data_idx (&lookahead_case, fv);
533 int diff = compare_values (this_value, lookahead_value, width);
537 /* Make sure the casefile was sorted */
538 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
540 case_destroy (&lookahead_case);
544 c += dict_get_case_weight (dict, &lookahead_case, &warn);
545 case_destroy (&lookahead_case);
548 casereader_destroy (lookahead);
551 if ( !value_is_missing (mv, this_value) )
556 for (i = 0; i < n_rank_specs; ++i)
558 const struct variable *dst_var = rs[i].destvars[dest_var_index];
560 if ( value_is_missing (mv, this_value) )
561 case_data_rw (&this_case, dst_var)->f = SYSMIS;
563 case_data_rw (&this_case, dst_var)->f =
564 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
566 casefile_append_xfer (dest, &this_case);
568 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
570 if ( !value_is_missing (mv, this_value) )
574 /* If this isn't true, then all the results will be wrong */
579 same_group (const struct ccase *a, const struct ccase *b,
580 const struct sort_criteria *crit)
584 for (i = 0; i < crit->crit_cnt - 1; i++)
586 struct sort_criterion *c = &crit->crits[i];
587 if (compare_values (case_data_idx (a, c->fv),
588 case_data_idx (b, c->fv), c->width) != 0)
595 static struct casefile *
596 rank_sorted_casefile (struct casefile *cf,
597 const struct sort_criteria *crit,
598 const struct dictionary *dict,
599 const struct rank_spec *rs,
602 const struct missing_values *mv)
604 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
605 struct casereader *lookahead = casefile_get_reader (cf, NULL);
606 struct casereader *pos = casereader_clone (lookahead);
607 struct ccase group_case;
610 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
612 if (casereader_read (lookahead, &group_case))
614 struct ccase this_case;
615 const union value *this_value ;
617 this_value = case_data_idx( &group_case, ultimate_crit->fv);
619 if ( !value_is_missing(mv, this_value) )
620 w = dict_get_case_weight (dict, &group_case, &warn);
622 while (casereader_read (lookahead, &this_case))
624 const union value *this_value =
625 case_data_idx(&this_case, ultimate_crit->fv);
626 double c = dict_get_case_weight (dict, &this_case, &warn);
627 if (!same_group (&group_case, &this_case, crit))
629 rank_cases (pos, casereader_cnum (lookahead) - 1,
637 case_destroy (&group_case);
638 case_move (&group_case, &this_case);
640 if ( !value_is_missing (mv, this_value) )
642 case_destroy (&this_case);
644 case_destroy (&group_case);
645 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
646 rs, n_rank_specs, dest_idx, dest);
649 if (casefile_error (dest))
651 casefile_destroy (dest);
655 casefile_destroy (cf);
660 /* Transformation function to enumerate all the cases */
662 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
664 struct variable *key_var = key_var_;
666 case_data_rw(cc, key_var)->f = case_num;
668 return TRNS_CONTINUE;
672 /* Create and return a new variable in which to store the ranks of SRC_VAR
673 accoring to the rank function F.
674 VNAME is the name of the variable to be created.
675 If VNAME is NULL, then a name will be automatically chosen.
677 static struct variable *
678 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
679 const struct variable *src_var,
683 struct variable *var = NULL;
684 char name[SHORT_NAME_LEN + 1];
687 var = dict_create_var(dict, vname, 0);
691 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
692 function_name[f][0], var_get_name (src_var));
694 var = dict_create_var(dict, name, 0);
701 snprintf(func_abb, 4, "%s", function_name[f]);
702 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
705 var = dict_create_var(dict, name, 0);
711 while ( NULL == var )
714 snprintf(func_abb, 3, "%s", function_name[f]);
716 snprintf(name, SHORT_NAME_LEN + 1,
717 "RNK%s%02d", func_abb, i);
719 var = dict_create_var(dict, name, 0);
726 msg(ME, _("Cannot create new rank variable. All candidates in use."));
730 var_set_both_formats (var, &dest_format[f]);
745 for (i = 0 ; i < n_rank_specs ; ++i )
747 free (rank_specs[i].destvars);
754 sort_destroy_criteria (sc);
763 cmd_rank (struct lexer *lexer, struct dataset *ds)
766 struct variable *order;
770 if ( !parse_rank (lexer, ds, &cmd, NULL) )
776 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
777 if (cmd.miss == RANK_INCLUDE )
778 value_is_missing = mv_is_value_system_missing;
780 value_is_missing = mv_is_value_missing;
783 /* Default to /RANK if no function subcommands are given */
784 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
785 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
786 cmd.sbc_percent || cmd.sbc_rank ) )
788 assert ( n_rank_specs == 0 );
790 rank_specs = xmalloc (sizeof (*rank_specs));
791 rank_specs[0].rfunc = RANK;
792 rank_specs[0].destvars =
793 xcalloc (sc->crit_cnt, sizeof (struct variable *));
798 assert ( sc->crit_cnt == n_src_vars);
800 /* Create variables for all rank destinations which haven't
801 already been created with INTO.
802 Add labels to all the destination variables.
804 for (i = 0 ; i < n_rank_specs ; ++i )
807 for ( v = 0 ; v < n_src_vars ; v ++ )
809 if ( rank_specs[i].destvars[v] == NULL )
811 rank_specs[i].destvars[v] =
812 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
815 create_var_label ( rank_specs[i].destvars[v],
817 rank_specs[i].rfunc);
821 if ( cmd.print == RANK_YES )
825 tab_output_text (0, _("Variables Created By RANK"));
826 tab_output_text (0, "\n");
828 for (i = 0 ; i < n_rank_specs ; ++i )
830 for ( v = 0 ; v < n_src_vars ; v ++ )
832 if ( n_group_vars > 0 )
834 struct string varlist;
837 ds_init_empty (&varlist);
838 for ( g = 0 ; g < n_group_vars ; ++g )
840 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
842 if ( g < n_group_vars - 1)
843 ds_put_cstr (&varlist, " ");
846 if ( rank_specs[i].rfunc == NORMAL ||
847 rank_specs[i].rfunc == PROPORTION )
848 tab_output_text (TAT_PRINTF,
849 _("%s into %s(%s of %s using %s BY %s)"),
850 var_get_name (src_vars[v]),
851 var_get_name (rank_specs[i].destvars[v]),
852 function_name[rank_specs[i].rfunc],
853 var_get_name (src_vars[v]),
859 tab_output_text (TAT_PRINTF,
860 _("%s into %s(%s of %s BY %s)"),
861 var_get_name (src_vars[v]),
862 var_get_name (rank_specs[i].destvars[v]),
863 function_name[rank_specs[i].rfunc],
864 var_get_name (src_vars[v]),
867 ds_destroy (&varlist);
871 if ( rank_specs[i].rfunc == NORMAL ||
872 rank_specs[i].rfunc == PROPORTION )
873 tab_output_text (TAT_PRINTF,
874 _("%s into %s(%s of %s using %s)"),
875 var_get_name (src_vars[v]),
876 var_get_name (rank_specs[i].destvars[v]),
877 function_name[rank_specs[i].rfunc],
878 var_get_name (src_vars[v]),
883 tab_output_text (TAT_PRINTF,
884 _("%s into %s(%s of %s)"),
885 var_get_name (src_vars[v]),
886 var_get_name (rank_specs[i].destvars[v]),
887 function_name[rank_specs[i].rfunc],
888 var_get_name (src_vars[v])
895 if ( cmd.sbc_fraction &&
896 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
897 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
899 /* Add a variable which we can sort by to get back the original
901 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
903 add_transformation (ds, create_resort_key, 0, order);
906 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
908 /* Put the active file back in its original order */
910 struct sort_criteria criteria;
911 struct sort_criterion restore_criterion ;
912 restore_criterion.fv = var_get_case_index (order);
913 restore_criterion.width = 0;
914 restore_criterion.dir = SRT_ASCEND;
916 criteria.crits = &restore_criterion;
917 criteria.crit_cnt = 1;
919 sort_active_file_in_place (ds, &criteria);
922 /* ... and we don't need our sort key anymore. So delete it */
923 dict_delete_var (dataset_dict (ds), order);
927 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
931 /* Parser for the variables sub command
932 Returns 1 on success */
934 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
936 static const int terminators[2] = {T_BY, 0};
938 lex_match (lexer, '=');
940 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
941 && lex_token (lexer) != T_ALL)
944 sc = sort_parse_criteria (lexer, dataset_dict (ds),
945 &src_vars, &n_src_vars, 0, terminators);
947 if ( lex_match (lexer, T_BY) )
949 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
954 if (!parse_variables (lexer, dataset_dict (ds),
955 &group_vars, &n_group_vars,
956 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
967 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
969 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
974 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
975 rank_specs[n_rank_specs - 1].rfunc = f;
976 rank_specs[n_rank_specs - 1].destvars = NULL;
978 rank_specs[n_rank_specs - 1].destvars =
979 xcalloc (sc->crit_cnt, sizeof (struct variable *));
981 if (lex_match_id (lexer, "INTO"))
983 struct variable *destvar;
985 while( lex_token (lexer) == T_ID )
988 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
990 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
993 if ( var_count >= sc->crit_cnt )
995 msg(SE, _("Too many variables in INTO clause."));
999 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
1000 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1012 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1014 struct dictionary *dict = dataset_dict (ds);
1016 return parse_rank_function (lexer, dict, cmd, RANK);
1020 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1022 struct dictionary *dict = dataset_dict (ds);
1024 return parse_rank_function (lexer, dict, cmd, NORMAL);
1028 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1030 struct dictionary *dict = dataset_dict (ds);
1032 return parse_rank_function (lexer, dict, cmd, PERCENT);
1036 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1038 struct dictionary *dict = dataset_dict (ds);
1040 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1044 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1046 struct dictionary *dict = dataset_dict (ds);
1048 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1052 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1054 struct dictionary *dict = dataset_dict (ds);
1056 return parse_rank_function (lexer, dict, cmd, N);
1060 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1062 struct dictionary *dict = dataset_dict (ds);
1064 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1069 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1071 struct dictionary *dict = dataset_dict (ds);
1073 if ( lex_force_match (lexer, '(') )
1075 if ( lex_force_int (lexer) )
1077 k_ntiles = lex_integer (lexer);
1079 lex_force_match (lexer, ')');
1087 return parse_rank_function (lexer, dict, cmd, NTILES);