1 /* PSPP - RANK. -*-c-*-
2 Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include "sort-criteria.h"
23 #include <data/dictionary.h>
24 #include <data/format.h>
25 #include <data/missing-values.h>
26 #include <data/procedure.h>
27 #include <data/variable.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/storage-stream.h>
32 #include <language/command.h>
33 #include <language/stats/sort-criteria.h>
35 #include <libpspp/compiler.h>
36 #include <math/sort.h>
37 #include <output/table.h>
38 #include <output/manager.h>
40 #include <gsl/gsl_cdf.h>
44 #define _(msgid) gettext (msgid)
60 +fraction=fraction:!blom/tukey/vw/rankit;
61 +ties=ties:!mean/low/high/condense;
62 missing=miss:!exclude/include.
67 typedef double (*rank_function_t) (double c, double cc, double cc_1,
70 static double rank_proportion (double c, double cc, double cc_1,
73 static double rank_normal (double c, double cc, double cc_1,
76 static double rank_percent (double c, double cc, double cc_1,
79 static double rank_rfraction (double c, double cc, double cc_1,
82 static double rank_rank (double c, double cc, double cc_1,
85 static double rank_n (double c, double cc, double cc_1,
88 static double rank_savage (double c, double cc, double cc_1,
91 static double rank_ntiles (double c, double cc, double cc_1,
108 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
109 {FMT_F, 9, 3}, /* rank */
110 {FMT_F, 6, 4}, /* normal */
111 {FMT_F, 6, 2}, /* percent */
112 {FMT_F, 6, 4}, /* rfraction */
113 {FMT_F, 6, 4}, /* proportion */
114 {FMT_F, 6, 0}, /* n */
115 {FMT_F, 3, 0}, /* ntiles */
116 {FMT_F, 8, 4} /* savage */
119 static const char * const function_name[n_RANK_FUNCS] = {
130 static const rank_function_t rank_func[n_RANK_FUNCS] = {
144 enum RANK_FUNC rfunc;
145 struct variable **destvars;
149 /* Categories of missing values to exclude. */
150 static enum mv_class exclude_values;
152 static struct rank_spec *rank_specs;
153 static size_t n_rank_specs;
155 static struct sort_criteria *sc;
157 static const struct variable **group_vars;
158 static size_t n_group_vars;
160 static const struct variable **src_vars;
161 static size_t n_src_vars;
166 static struct cmd_rank cmd;
168 static struct casefile *rank_sorted_casefile (struct casefile *cf,
169 const struct sort_criteria *,
170 const struct dictionary *,
171 const struct rank_spec *rs,
174 const struct missing_values *miss
179 static char name[10];
180 switch ( cmd.fraction )
183 strcpy (name, "BLOM");
186 strcpy (name, "RANKIT");
189 strcpy (name, "TUKEY");
200 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
202 create_var_label (struct variable *dest_var,
203 const struct variable *src_var, enum RANK_FUNC f)
206 ds_init_empty (&label);
208 if ( n_group_vars > 0 )
210 struct string group_var_str;
213 ds_init_empty (&group_var_str);
215 for (g = 0 ; g < n_group_vars ; ++g )
217 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
218 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
221 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
222 var_get_name (src_var), ds_cstr (&group_var_str));
223 ds_destroy (&group_var_str);
226 ds_put_format (&label, _("%s of %s"),
227 function_name[f], var_get_name (src_var));
229 var_set_label (dest_var, ds_cstr (&label));
236 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
237 const struct rank_spec *rank_specs, int n_rank_specs)
239 struct sort_criteria criteria;
242 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
244 criteria.crit_cnt = n_splits + n_group_vars + 1;
245 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
246 for (i = 0; i < n_splits ; i++)
248 const struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
249 criteria.crits[i].fv = var_get_case_index (v);
250 criteria.crits[i].width = var_get_width (v);
251 criteria.crits[i].dir = SRT_ASCEND;
253 for (i = 0; i < n_group_vars; i++)
255 criteria.crits[i + n_splits].fv = var_get_case_index (group_vars[i]);
256 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
257 criteria.crits[i + n_splits].dir = SRT_ASCEND;
259 for (i = 0 ; i < sc->crit_cnt ; ++i )
261 struct casefile *out ;
262 struct casefile *cf ;
263 struct casereader *reader ;
264 struct casefile *sorted_cf ;
266 /* Obtain active file in CF. */
267 if (!procedure (ds, NULL, NULL))
270 cf = proc_capture_output (ds);
272 /* Sort CF into SORTED_CF. */
273 reader = casefile_get_destructive_reader (cf) ;
274 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
275 assert ( sc->crits[i].fv == var_get_case_index (src_vars[i]) );
276 sorted_cf = sort_execute (reader, &criteria, NULL);
277 casefile_destroy (cf);
279 out = rank_sorted_casefile (sorted_cf, &criteria,
281 rank_specs, n_rank_specs,
282 i, var_get_missing_values (src_vars[i]));
289 proc_set_source (ds, storage_source_create (out));
292 free (criteria.crits);
296 free (criteria.crits);
300 /* Hardly a rank function !! */
302 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
303 int i UNUSED, double w)
310 rank_rank (double c, double cc, double cc_1,
311 int i, double w UNUSED)
325 rank = cc_1 + (c + 1.0)/ 2.0;
345 rank = cc_1 + c / 2.0 ;
360 rank_rfraction (double c, double cc, double cc_1,
363 return rank_rank (c, cc, cc_1, i, w) / w ;
368 rank_percent (double c, double cc, double cc_1,
371 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
376 rank_proportion (double c, double cc, double cc_1,
379 const double r = rank_rank (c, cc, cc_1, i, w) ;
383 switch ( cmd.fraction )
386 f = (r - 3.0/8.0) / (w + 0.25);
392 f = (r - 1.0/3.0) / (w + 1.0/3.0);
402 return (f > 0) ? f : SYSMIS;
406 rank_normal (double c, double cc, double cc_1,
409 double f = rank_proportion (c, cc, cc_1, i, w);
411 return gsl_cdf_ugaussian_Pinv (f);
415 rank_ntiles (double c, double cc, double cc_1,
418 double r = rank_rank (c, cc, cc_1, i, w);
421 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
424 /* Expected value of the order statistics from an exponential distribution */
426 ee (int j, double w_star)
431 for (k = 1 ; k <= j; k++)
432 sum += 1.0 / ( w_star + 1 - k );
439 rank_savage (double c, double cc, double cc_1,
440 int i UNUSED, double w)
443 const int i_1 = floor (cc_1);
444 const int i_2 = floor (cc);
446 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
448 const double g_1 = cc_1 - i_1;
449 const double g_2 = cc - i_2;
451 /* The second factor is infinite, when the first is zero.
452 Therefore, evaluate the second, only when the first is non-zero */
453 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
454 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
457 return ee (i_1 + 1, w_star) - 1;
459 if ( i_1 + 1 == i_2 )
460 return ( ( expr1 + expr2 )/c ) - 1;
462 if ( i_1 + 2 <= i_2 )
466 for (j = i_1 + 2 ; j <= i_2; ++j )
467 sigma += ee (j, w_star);
468 return ( (expr1 + expr2 + sigma) / c) -1;
475 /* Rank the casefile belonging to CR, starting from the current
476 postition of CR continuing up to and including the ENDth case.
478 RS points to an array containing the rank specifications to
479 use. N_RANK_SPECS is the number of elements of RS.
482 DEST_VAR_INDEX is the index into the rank_spec destvar element
483 to be used for this ranking.
485 Prerequisites: 1. The casefile must be sorted according to CRITERION.
486 2. W is the sum of the non-missing caseweights for this
487 range of the casefile.
490 rank_cases (struct casereader *cr,
492 const struct dictionary *dict,
493 const struct sort_criterion *criterion,
494 const struct missing_values *mv,
496 const struct rank_spec *rs,
499 struct casefile *dest)
506 const int fv = criterion->fv;
507 const int width = criterion->width;
509 while (casereader_cnum (cr) < end)
511 struct casereader *lookahead;
512 const union value *this_value;
513 bool this_value_is_missing;
514 struct ccase this_case, lookahead_case;
519 if (!casereader_read_xfer (cr, &this_case))
522 this_value = case_data_idx (&this_case, fv);
523 this_value_is_missing = mv_is_value_missing (mv, this_value,
525 c = dict_get_case_weight (dict, &this_case, &warn);
527 lookahead = casereader_clone (cr);
529 while (casereader_cnum (lookahead) < end
530 && casereader_read_xfer (lookahead, &lookahead_case))
532 const union value *lookahead_value = case_data_idx (&lookahead_case, fv);
533 int diff = compare_values (this_value, lookahead_value, width);
537 /* Make sure the casefile was sorted */
538 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
540 case_destroy (&lookahead_case);
544 c += dict_get_case_weight (dict, &lookahead_case, &warn);
545 case_destroy (&lookahead_case);
548 casereader_destroy (lookahead);
551 if ( !this_value_is_missing )
556 for (i = 0; i < n_rank_specs; ++i)
558 const struct variable *dst_var = rs[i].destvars[dest_var_index];
560 if (this_value_is_missing)
561 case_data_rw (&this_case, dst_var)->f = SYSMIS;
563 case_data_rw (&this_case, dst_var)->f =
564 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
566 casefile_append_xfer (dest, &this_case);
568 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
570 if ( !this_value_is_missing )
574 /* If this isn't true, then all the results will be wrong */
579 same_group (const struct ccase *a, const struct ccase *b,
580 const struct sort_criteria *crit)
584 for (i = 0; i < crit->crit_cnt - 1; i++)
586 struct sort_criterion *c = &crit->crits[i];
587 if (compare_values (case_data_idx (a, c->fv),
588 case_data_idx (b, c->fv), c->width) != 0)
595 static struct casefile *
596 rank_sorted_casefile (struct casefile *cf,
597 const struct sort_criteria *crit,
598 const struct dictionary *dict,
599 const struct rank_spec *rs,
602 const struct missing_values *mv)
604 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
605 struct casereader *lookahead = casefile_get_reader (cf, NULL);
606 struct casereader *pos = casereader_clone (lookahead);
607 struct ccase group_case;
610 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
612 if (casereader_read (lookahead, &group_case))
614 struct ccase this_case;
615 const union value *this_value ;
617 this_value = case_data_idx( &group_case, ultimate_crit->fv);
619 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
620 w = dict_get_case_weight (dict, &group_case, &warn);
622 while (casereader_read (lookahead, &this_case))
624 const union value *this_value =
625 case_data_idx(&this_case, ultimate_crit->fv);
626 double c = dict_get_case_weight (dict, &this_case, &warn);
627 if (!same_group (&group_case, &this_case, crit))
629 rank_cases (pos, casereader_cnum (lookahead) - 1,
637 case_destroy (&group_case);
638 case_move (&group_case, &this_case);
640 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
642 case_destroy (&this_case);
644 case_destroy (&group_case);
645 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
646 rs, n_rank_specs, dest_idx, dest);
649 if (casefile_error (dest))
651 casefile_destroy (dest);
655 casefile_destroy (cf);
660 /* Transformation function to enumerate all the cases */
662 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
664 struct variable *key_var = key_var_;
666 case_data_rw(cc, key_var)->f = case_num;
668 return TRNS_CONTINUE;
672 /* Create and return a new variable in which to store the ranks of SRC_VAR
673 accoring to the rank function F.
674 VNAME is the name of the variable to be created.
675 If VNAME is NULL, then a name will be automatically chosen.
677 static struct variable *
678 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
679 const struct variable *src_var,
683 struct variable *var = NULL;
684 char name[SHORT_NAME_LEN + 1];
687 var = dict_create_var(dict, vname, 0);
691 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
692 function_name[f][0], var_get_name (src_var));
694 var = dict_create_var(dict, name, 0);
701 snprintf(func_abb, 4, "%s", function_name[f]);
702 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
705 var = dict_create_var(dict, name, 0);
711 while ( NULL == var )
714 snprintf(func_abb, 3, "%s", function_name[f]);
716 snprintf(name, SHORT_NAME_LEN + 1,
717 "RNK%s%02d", func_abb, i);
719 var = dict_create_var(dict, name, 0);
726 msg(ME, _("Cannot create new rank variable. All candidates in use."));
730 var_set_both_formats (var, &dest_format[f]);
745 for (i = 0 ; i < n_rank_specs ; ++i )
746 free (rank_specs[i].destvars);
752 sort_destroy_criteria (sc);
761 cmd_rank (struct lexer *lexer, struct dataset *ds)
764 struct variable *order;
768 if ( !parse_rank (lexer, ds, &cmd, NULL) )
774 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
775 exclude_values = cmd.miss == RANK_INCLUDE ? MV_SYSTEM : MV_ANY;
777 /* Default to /RANK if no function subcommands are given */
778 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
779 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
780 cmd.sbc_percent || cmd.sbc_rank ) )
782 assert ( n_rank_specs == 0 );
784 rank_specs = xmalloc (sizeof (*rank_specs));
785 rank_specs[0].rfunc = RANK;
786 rank_specs[0].destvars =
787 xcalloc (sc->crit_cnt, sizeof (struct variable *));
792 assert ( sc->crit_cnt == n_src_vars);
794 /* Create variables for all rank destinations which haven't
795 already been created with INTO.
796 Add labels to all the destination variables.
798 for (i = 0 ; i < n_rank_specs ; ++i )
801 for ( v = 0 ; v < n_src_vars ; v ++ )
803 if ( rank_specs[i].destvars[v] == NULL )
805 rank_specs[i].destvars[v] =
806 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
809 create_var_label ( rank_specs[i].destvars[v],
811 rank_specs[i].rfunc);
815 if ( cmd.print == RANK_YES )
819 tab_output_text (0, _("Variables Created By RANK"));
820 tab_output_text (0, "\n");
822 for (i = 0 ; i < n_rank_specs ; ++i )
824 for ( v = 0 ; v < n_src_vars ; v ++ )
826 if ( n_group_vars > 0 )
828 struct string varlist;
831 ds_init_empty (&varlist);
832 for ( g = 0 ; g < n_group_vars ; ++g )
834 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
836 if ( g < n_group_vars - 1)
837 ds_put_cstr (&varlist, " ");
840 if ( rank_specs[i].rfunc == NORMAL ||
841 rank_specs[i].rfunc == PROPORTION )
842 tab_output_text (TAT_PRINTF,
843 _("%s into %s(%s of %s using %s BY %s)"),
844 var_get_name (src_vars[v]),
845 var_get_name (rank_specs[i].destvars[v]),
846 function_name[rank_specs[i].rfunc],
847 var_get_name (src_vars[v]),
853 tab_output_text (TAT_PRINTF,
854 _("%s into %s(%s of %s BY %s)"),
855 var_get_name (src_vars[v]),
856 var_get_name (rank_specs[i].destvars[v]),
857 function_name[rank_specs[i].rfunc],
858 var_get_name (src_vars[v]),
861 ds_destroy (&varlist);
865 if ( rank_specs[i].rfunc == NORMAL ||
866 rank_specs[i].rfunc == PROPORTION )
867 tab_output_text (TAT_PRINTF,
868 _("%s into %s(%s of %s using %s)"),
869 var_get_name (src_vars[v]),
870 var_get_name (rank_specs[i].destvars[v]),
871 function_name[rank_specs[i].rfunc],
872 var_get_name (src_vars[v]),
877 tab_output_text (TAT_PRINTF,
878 _("%s into %s(%s of %s)"),
879 var_get_name (src_vars[v]),
880 var_get_name (rank_specs[i].destvars[v]),
881 function_name[rank_specs[i].rfunc],
882 var_get_name (src_vars[v])
889 if ( cmd.sbc_fraction &&
890 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
891 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
893 /* Add a variable which we can sort by to get back the original
895 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
897 add_transformation (ds, create_resort_key, 0, order);
900 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
902 /* Put the active file back in its original order */
904 struct sort_criteria criteria;
905 struct sort_criterion restore_criterion ;
906 restore_criterion.fv = var_get_case_index (order);
907 restore_criterion.width = 0;
908 restore_criterion.dir = SRT_ASCEND;
910 criteria.crits = &restore_criterion;
911 criteria.crit_cnt = 1;
913 sort_active_file_in_place (ds, &criteria);
916 /* ... and we don't need our sort key anymore. So delete it */
917 dict_delete_var (dataset_dict (ds), order);
922 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
926 /* Parser for the variables sub command
927 Returns 1 on success */
929 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
931 static const int terminators[2] = {T_BY, 0};
933 lex_match (lexer, '=');
935 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
936 && lex_token (lexer) != T_ALL)
939 sc = sort_parse_criteria (lexer, dataset_dict (ds),
940 &src_vars, &n_src_vars, 0, terminators);
942 if ( lex_match (lexer, T_BY) )
944 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
949 if (!parse_variables_const (lexer, dataset_dict (ds),
950 &group_vars, &n_group_vars,
951 PV_NO_DUPLICATE | PV_NO_SCRATCH) )
962 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
964 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
969 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
970 rank_specs[n_rank_specs - 1].rfunc = f;
971 rank_specs[n_rank_specs - 1].destvars = NULL;
973 rank_specs[n_rank_specs - 1].destvars =
974 xcalloc (sc->crit_cnt, sizeof (struct variable *));
976 if (lex_match_id (lexer, "INTO"))
978 struct variable *destvar;
980 while( lex_token (lexer) == T_ID )
983 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
985 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
988 if ( var_count >= sc->crit_cnt )
990 msg(SE, _("Too many variables in INTO clause."));
994 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
995 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1007 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1009 struct dictionary *dict = dataset_dict (ds);
1011 return parse_rank_function (lexer, dict, cmd, RANK);
1015 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1017 struct dictionary *dict = dataset_dict (ds);
1019 return parse_rank_function (lexer, dict, cmd, NORMAL);
1023 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1025 struct dictionary *dict = dataset_dict (ds);
1027 return parse_rank_function (lexer, dict, cmd, PERCENT);
1031 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1033 struct dictionary *dict = dataset_dict (ds);
1035 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1039 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1041 struct dictionary *dict = dataset_dict (ds);
1043 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1047 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1049 struct dictionary *dict = dataset_dict (ds);
1051 return parse_rank_function (lexer, dict, cmd, N);
1055 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1057 struct dictionary *dict = dataset_dict (ds);
1059 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1064 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1066 struct dictionary *dict = dataset_dict (ds);
1068 if ( lex_force_match (lexer, '(') )
1070 if ( lex_force_int (lexer) )
1072 k_ntiles = lex_integer (lexer);
1074 lex_force_match (lexer, ')');
1082 return parse_rank_function (lexer, dict, cmd, NTILES);