1 /* PSPP - RANK. -*-c-*-
2 Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 #include "sort-criteria.h"
23 #include <data/dictionary.h>
24 #include <data/format.h>
25 #include <data/missing-values.h>
26 #include <data/procedure.h>
27 #include <data/variable.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/storage-stream.h>
32 #include <language/command.h>
33 #include <language/stats/sort-criteria.h>
35 #include <libpspp/compiler.h>
36 #include <math/sort.h>
37 #include <output/table.h>
38 #include <output/manager.h>
40 #include <gsl/gsl_cdf.h>
44 #define _(msgid) gettext (msgid)
60 +fraction=fraction:!blom/tukey/vw/rankit;
61 +ties=ties:!mean/low/high/condense;
62 missing=miss:!exclude/include.
67 typedef double (*rank_function_t) (double c, double cc, double cc_1,
70 static double rank_proportion (double c, double cc, double cc_1,
73 static double rank_normal (double c, double cc, double cc_1,
76 static double rank_percent (double c, double cc, double cc_1,
79 static double rank_rfraction (double c, double cc, double cc_1,
82 static double rank_rank (double c, double cc, double cc_1,
85 static double rank_n (double c, double cc, double cc_1,
88 static double rank_savage (double c, double cc, double cc_1,
91 static double rank_ntiles (double c, double cc, double cc_1,
108 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
109 {FMT_F, 9, 3}, /* rank */
110 {FMT_F, 6, 4}, /* normal */
111 {FMT_F, 6, 2}, /* percent */
112 {FMT_F, 6, 4}, /* rfraction */
113 {FMT_F, 6, 4}, /* proportion */
114 {FMT_F, 6, 0}, /* n */
115 {FMT_F, 3, 0}, /* ntiles */
116 {FMT_F, 8, 4} /* savage */
119 static const char * const function_name[n_RANK_FUNCS] = {
130 static const rank_function_t rank_func[n_RANK_FUNCS] = {
144 enum RANK_FUNC rfunc;
145 struct variable **destvars;
149 /* Categories of missing values to exclude. */
150 static enum mv_class exclude_values;
152 static struct rank_spec *rank_specs;
153 static size_t n_rank_specs;
155 static struct sort_criteria *sc;
157 static struct variable **group_vars;
158 static size_t n_group_vars;
160 static struct variable **src_vars;
161 static size_t n_src_vars;
166 static struct cmd_rank cmd;
168 static struct casefile *rank_sorted_casefile (struct casefile *cf,
169 const struct sort_criteria *,
170 const struct dictionary *,
171 const struct rank_spec *rs,
174 const struct missing_values *miss
179 static char name[10];
180 switch ( cmd.fraction )
183 strcpy (name, "BLOM");
186 strcpy (name, "RANKIT");
189 strcpy (name, "TUKEY");
200 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
202 create_var_label (struct variable *dest_var,
203 const struct variable *src_var, enum RANK_FUNC f)
206 ds_init_empty (&label);
208 if ( n_group_vars > 0 )
210 struct string group_var_str;
213 ds_init_empty (&group_var_str);
215 for (g = 0 ; g < n_group_vars ; ++g )
217 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
218 ds_put_cstr (&group_var_str, var_get_name (group_vars[g]));
221 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
222 var_get_name (src_var), ds_cstr (&group_var_str));
223 ds_destroy (&group_var_str);
226 ds_put_format (&label, _("%s of %s"),
227 function_name[f], var_get_name (src_var));
229 var_set_label (dest_var, ds_cstr (&label));
236 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
237 const struct rank_spec *rank_specs, int n_rank_specs)
239 struct sort_criteria criteria;
242 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
244 criteria.crit_cnt = n_splits + n_group_vars + 1;
245 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
246 for (i = 0; i < n_splits ; i++)
248 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
249 criteria.crits[i].fv = var_get_case_index (v);
250 criteria.crits[i].width = var_get_width (v);
251 criteria.crits[i].dir = SRT_ASCEND;
253 for (i = 0; i < n_group_vars; i++)
255 criteria.crits[i + n_splits].fv = var_get_case_index (group_vars[i]);
256 criteria.crits[i + n_splits].width = var_get_width (group_vars[i]);
257 criteria.crits[i + n_splits].dir = SRT_ASCEND;
259 for (i = 0 ; i < sc->crit_cnt ; ++i )
261 struct casefile *out ;
262 struct casefile *cf ;
263 struct casereader *reader ;
264 struct casefile *sorted_cf ;
266 /* Obtain active file in CF. */
267 if (!procedure (ds, NULL, NULL))
270 cf = proc_capture_output (ds);
272 /* Sort CF into SORTED_CF. */
273 reader = casefile_get_destructive_reader (cf) ;
274 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
275 assert ( sc->crits[i].fv == var_get_case_index (src_vars[i]) );
276 sorted_cf = sort_execute (reader, &criteria, NULL);
277 casefile_destroy (cf);
279 out = rank_sorted_casefile (sorted_cf, &criteria,
281 rank_specs, n_rank_specs,
282 i, var_get_missing_values (src_vars[i]));
289 proc_set_source (ds, storage_source_create (out));
292 free (criteria.crits);
296 free (criteria.crits);
300 /* Hardly a rank function !! */
302 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
303 int i UNUSED, double w)
310 rank_rank (double c, double cc, double cc_1,
311 int i, double w UNUSED)
325 rank = cc_1 + (c + 1.0)/ 2.0;
345 rank = cc_1 + c / 2.0 ;
360 rank_rfraction (double c, double cc, double cc_1,
363 return rank_rank (c, cc, cc_1, i, w) / w ;
368 rank_percent (double c, double cc, double cc_1,
371 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
376 rank_proportion (double c, double cc, double cc_1,
379 const double r = rank_rank (c, cc, cc_1, i, w) ;
383 switch ( cmd.fraction )
386 f = (r - 3.0/8.0) / (w + 0.25);
392 f = (r - 1.0/3.0) / (w + 1.0/3.0);
402 return (f > 0) ? f : SYSMIS;
406 rank_normal (double c, double cc, double cc_1,
409 double f = rank_proportion (c, cc, cc_1, i, w);
411 return gsl_cdf_ugaussian_Pinv (f);
415 rank_ntiles (double c, double cc, double cc_1,
418 double r = rank_rank (c, cc, cc_1, i, w);
421 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
424 /* Expected value of the order statistics from an exponential distribution */
426 ee (int j, double w_star)
431 for (k = 1 ; k <= j; k++)
432 sum += 1.0 / ( w_star + 1 - k );
439 rank_savage (double c, double cc, double cc_1,
440 int i UNUSED, double w)
443 const int i_1 = floor (cc_1);
444 const int i_2 = floor (cc);
446 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
448 const double g_1 = cc_1 - i_1;
449 const double g_2 = cc - i_2;
451 /* The second factor is infinite, when the first is zero.
452 Therefore, evaluate the second, only when the first is non-zero */
453 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
454 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
457 return ee (i_1 + 1, w_star) - 1;
459 if ( i_1 + 1 == i_2 )
460 return ( ( expr1 + expr2 )/c ) - 1;
462 if ( i_1 + 2 <= i_2 )
466 for (j = i_1 + 2 ; j <= i_2; ++j )
467 sigma += ee (j, w_star);
468 return ( (expr1 + expr2 + sigma) / c) -1;
475 /* Rank the casefile belonging to CR, starting from the current
476 postition of CR continuing up to and including the ENDth case.
478 RS points to an array containing the rank specifications to
479 use. N_RANK_SPECS is the number of elements of RS.
482 DEST_VAR_INDEX is the index into the rank_spec destvar element
483 to be used for this ranking.
485 Prerequisites: 1. The casefile must be sorted according to CRITERION.
486 2. W is the sum of the non-missing caseweights for this
487 range of the casefile.
490 rank_cases (struct casereader *cr,
492 const struct dictionary *dict,
493 const struct sort_criterion *criterion,
494 const struct missing_values *mv,
496 const struct rank_spec *rs,
499 struct casefile *dest)
506 const int fv = criterion->fv;
507 const int width = criterion->width;
509 while (casereader_cnum (cr) < end)
511 struct casereader *lookahead;
512 const union value *this_value;
513 struct ccase this_case, lookahead_case;
518 if (!casereader_read_xfer (cr, &this_case))
521 this_value = case_data_idx (&this_case, fv);
522 c = dict_get_case_weight (dict, &this_case, &warn);
524 lookahead = casereader_clone (cr);
526 while (casereader_cnum (lookahead) < end
527 && casereader_read_xfer (lookahead, &lookahead_case))
529 const union value *lookahead_value = case_data_idx (&lookahead_case, fv);
530 int diff = compare_values (this_value, lookahead_value, width);
534 /* Make sure the casefile was sorted */
535 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
537 case_destroy (&lookahead_case);
541 c += dict_get_case_weight (dict, &lookahead_case, &warn);
542 case_destroy (&lookahead_case);
545 casereader_destroy (lookahead);
548 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
553 for (i = 0; i < n_rank_specs; ++i)
555 const struct variable *dst_var = rs[i].destvars[dest_var_index];
557 if ( mv_is_value_missing (mv, this_value, exclude_values) )
558 case_data_rw (&this_case, dst_var)->f = SYSMIS;
560 case_data_rw (&this_case, dst_var)->f =
561 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
563 casefile_append_xfer (dest, &this_case);
565 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
567 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
571 /* If this isn't true, then all the results will be wrong */
576 same_group (const struct ccase *a, const struct ccase *b,
577 const struct sort_criteria *crit)
581 for (i = 0; i < crit->crit_cnt - 1; i++)
583 struct sort_criterion *c = &crit->crits[i];
584 if (compare_values (case_data_idx (a, c->fv),
585 case_data_idx (b, c->fv), c->width) != 0)
592 static struct casefile *
593 rank_sorted_casefile (struct casefile *cf,
594 const struct sort_criteria *crit,
595 const struct dictionary *dict,
596 const struct rank_spec *rs,
599 const struct missing_values *mv)
601 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
602 struct casereader *lookahead = casefile_get_reader (cf, NULL);
603 struct casereader *pos = casereader_clone (lookahead);
604 struct ccase group_case;
607 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
609 if (casereader_read (lookahead, &group_case))
611 struct ccase this_case;
612 const union value *this_value ;
614 this_value = case_data_idx( &group_case, ultimate_crit->fv);
616 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
617 w = dict_get_case_weight (dict, &group_case, &warn);
619 while (casereader_read (lookahead, &this_case))
621 const union value *this_value =
622 case_data_idx(&this_case, ultimate_crit->fv);
623 double c = dict_get_case_weight (dict, &this_case, &warn);
624 if (!same_group (&group_case, &this_case, crit))
626 rank_cases (pos, casereader_cnum (lookahead) - 1,
634 case_destroy (&group_case);
635 case_move (&group_case, &this_case);
637 if ( !mv_is_value_missing (mv, this_value, exclude_values) )
639 case_destroy (&this_case);
641 case_destroy (&group_case);
642 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
643 rs, n_rank_specs, dest_idx, dest);
646 if (casefile_error (dest))
648 casefile_destroy (dest);
652 casefile_destroy (cf);
657 /* Transformation function to enumerate all the cases */
659 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
661 struct variable *key_var = key_var_;
663 case_data_rw(cc, key_var)->f = case_num;
665 return TRNS_CONTINUE;
669 /* Create and return a new variable in which to store the ranks of SRC_VAR
670 accoring to the rank function F.
671 VNAME is the name of the variable to be created.
672 If VNAME is NULL, then a name will be automatically chosen.
674 static struct variable *
675 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
676 const struct variable *src_var,
680 struct variable *var = NULL;
681 char name[SHORT_NAME_LEN + 1];
684 var = dict_create_var(dict, vname, 0);
688 snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
689 function_name[f][0], var_get_name (src_var));
691 var = dict_create_var(dict, name, 0);
698 snprintf(func_abb, 4, "%s", function_name[f]);
699 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
702 var = dict_create_var(dict, name, 0);
708 while ( NULL == var )
711 snprintf(func_abb, 3, "%s", function_name[f]);
713 snprintf(name, SHORT_NAME_LEN + 1,
714 "RNK%s%02d", func_abb, i);
716 var = dict_create_var(dict, name, 0);
723 msg(ME, _("Cannot create new rank variable. All candidates in use."));
727 var_set_both_formats (var, &dest_format[f]);
742 for (i = 0 ; i < n_rank_specs ; ++i )
743 free (rank_specs[i].destvars);
749 sort_destroy_criteria (sc);
758 cmd_rank (struct lexer *lexer, struct dataset *ds)
761 struct variable *order;
765 if ( !parse_rank (lexer, ds, &cmd, NULL) )
771 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
772 exclude_values = cmd.miss == RANK_INCLUDE ? MV_SYSTEM : MV_ANY;
774 /* Default to /RANK if no function subcommands are given */
775 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
776 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
777 cmd.sbc_percent || cmd.sbc_rank ) )
779 assert ( n_rank_specs == 0 );
781 rank_specs = xmalloc (sizeof (*rank_specs));
782 rank_specs[0].rfunc = RANK;
783 rank_specs[0].destvars =
784 xcalloc (sc->crit_cnt, sizeof (struct variable *));
789 assert ( sc->crit_cnt == n_src_vars);
791 /* Create variables for all rank destinations which haven't
792 already been created with INTO.
793 Add labels to all the destination variables.
795 for (i = 0 ; i < n_rank_specs ; ++i )
798 for ( v = 0 ; v < n_src_vars ; v ++ )
800 if ( rank_specs[i].destvars[v] == NULL )
802 rank_specs[i].destvars[v] =
803 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
806 create_var_label ( rank_specs[i].destvars[v],
808 rank_specs[i].rfunc);
812 if ( cmd.print == RANK_YES )
816 tab_output_text (0, _("Variables Created By RANK"));
817 tab_output_text (0, "\n");
819 for (i = 0 ; i < n_rank_specs ; ++i )
821 for ( v = 0 ; v < n_src_vars ; v ++ )
823 if ( n_group_vars > 0 )
825 struct string varlist;
828 ds_init_empty (&varlist);
829 for ( g = 0 ; g < n_group_vars ; ++g )
831 ds_put_cstr (&varlist, var_get_name (group_vars[g]));
833 if ( g < n_group_vars - 1)
834 ds_put_cstr (&varlist, " ");
837 if ( rank_specs[i].rfunc == NORMAL ||
838 rank_specs[i].rfunc == PROPORTION )
839 tab_output_text (TAT_PRINTF,
840 _("%s into %s(%s of %s using %s BY %s)"),
841 var_get_name (src_vars[v]),
842 var_get_name (rank_specs[i].destvars[v]),
843 function_name[rank_specs[i].rfunc],
844 var_get_name (src_vars[v]),
850 tab_output_text (TAT_PRINTF,
851 _("%s into %s(%s of %s BY %s)"),
852 var_get_name (src_vars[v]),
853 var_get_name (rank_specs[i].destvars[v]),
854 function_name[rank_specs[i].rfunc],
855 var_get_name (src_vars[v]),
858 ds_destroy (&varlist);
862 if ( rank_specs[i].rfunc == NORMAL ||
863 rank_specs[i].rfunc == PROPORTION )
864 tab_output_text (TAT_PRINTF,
865 _("%s into %s(%s of %s using %s)"),
866 var_get_name (src_vars[v]),
867 var_get_name (rank_specs[i].destvars[v]),
868 function_name[rank_specs[i].rfunc],
869 var_get_name (src_vars[v]),
874 tab_output_text (TAT_PRINTF,
875 _("%s into %s(%s of %s)"),
876 var_get_name (src_vars[v]),
877 var_get_name (rank_specs[i].destvars[v]),
878 function_name[rank_specs[i].rfunc],
879 var_get_name (src_vars[v])
886 if ( cmd.sbc_fraction &&
887 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
888 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
890 /* Add a variable which we can sort by to get back the original
892 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
894 add_transformation (ds, create_resort_key, 0, order);
897 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
899 /* Put the active file back in its original order */
901 struct sort_criteria criteria;
902 struct sort_criterion restore_criterion ;
903 restore_criterion.fv = var_get_case_index (order);
904 restore_criterion.width = 0;
905 restore_criterion.dir = SRT_ASCEND;
907 criteria.crits = &restore_criterion;
908 criteria.crit_cnt = 1;
910 sort_active_file_in_place (ds, &criteria);
913 /* ... and we don't need our sort key anymore. So delete it */
914 dict_delete_var (dataset_dict (ds), order);
919 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
923 /* Parser for the variables sub command
924 Returns 1 on success */
926 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
928 static const int terminators[2] = {T_BY, 0};
930 lex_match (lexer, '=');
932 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
933 && lex_token (lexer) != T_ALL)
936 sc = sort_parse_criteria (lexer, dataset_dict (ds),
937 &src_vars, &n_src_vars, 0, terminators);
939 if ( lex_match (lexer, T_BY) )
941 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
946 if (!parse_variables (lexer, dataset_dict (ds),
947 &group_vars, &n_group_vars,
948 PV_NO_DUPLICATE | PV_NO_SCRATCH) )
959 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
961 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
966 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
967 rank_specs[n_rank_specs - 1].rfunc = f;
968 rank_specs[n_rank_specs - 1].destvars = NULL;
970 rank_specs[n_rank_specs - 1].destvars =
971 xcalloc (sc->crit_cnt, sizeof (struct variable *));
973 if (lex_match_id (lexer, "INTO"))
975 struct variable *destvar;
977 while( lex_token (lexer) == T_ID )
980 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
982 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
985 if ( var_count >= sc->crit_cnt )
987 msg(SE, _("Too many variables in INTO clause."));
991 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
992 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1004 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1006 struct dictionary *dict = dataset_dict (ds);
1008 return parse_rank_function (lexer, dict, cmd, RANK);
1012 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1014 struct dictionary *dict = dataset_dict (ds);
1016 return parse_rank_function (lexer, dict, cmd, NORMAL);
1020 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1022 struct dictionary *dict = dataset_dict (ds);
1024 return parse_rank_function (lexer, dict, cmd, PERCENT);
1028 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1030 struct dictionary *dict = dataset_dict (ds);
1032 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1036 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1038 struct dictionary *dict = dataset_dict (ds);
1040 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1044 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1046 struct dictionary *dict = dataset_dict (ds);
1048 return parse_rank_function (lexer, dict, cmd, N);
1052 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1054 struct dictionary *dict = dataset_dict (ds);
1056 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1061 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1063 struct dictionary *dict = dataset_dict (ds);
1065 if ( lex_force_match (lexer, '(') )
1067 if ( lex_force_int (lexer) )
1069 k_ntiles = lex_integer (lexer);
1071 lex_force_match (lexer, ')');
1079 return parse_rank_function (lexer, dict, cmd, NTILES);