1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char * const function_name[n_RANK_FUNCS] = {
131 static const rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct dictionary *,
172 const struct rank_spec *rs,
175 const struct missing_values *miss
180 static char name[10];
181 switch ( cmd.fraction )
184 strcpy (name, "BLOM");
187 strcpy (name, "RANKIT");
190 strcpy (name, "TUKEY");
201 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
203 create_var_label (struct variable *dest_var,
204 const struct variable *src_var, enum RANK_FUNC f)
207 ds_init_empty (&label);
209 if ( n_group_vars > 0 )
211 struct string group_var_str;
214 ds_init_empty (&group_var_str);
216 for (g = 0 ; g < n_group_vars ; ++g )
218 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
219 ds_put_cstr (&group_var_str, group_vars[g]->name);
222 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
223 src_var->name, ds_cstr (&group_var_str));
224 ds_destroy (&group_var_str);
227 ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name);
229 dest_var->label = strdup (ds_cstr (&label) );
236 rank_cmd (struct dataset *ds, const struct sort_criteria *sc,
237 const struct rank_spec *rank_specs, int n_rank_specs)
239 struct sort_criteria criteria;
242 const int n_splits = dict_get_split_cnt (dataset_dict (ds));
244 criteria.crit_cnt = n_splits + n_group_vars + 1;
245 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
246 for (i = 0; i < n_splits ; i++)
248 struct variable *v = dict_get_split_vars (dataset_dict (ds))[i];
249 criteria.crits[i].fv = v->fv;
250 criteria.crits[i].width = v->width;
251 criteria.crits[i].dir = SRT_ASCEND;
253 for (i = 0; i < n_group_vars; i++)
255 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
256 criteria.crits[i + n_splits].width = group_vars[i]->width;
257 criteria.crits[i + n_splits].dir = SRT_ASCEND;
259 for (i = 0 ; i < sc->crit_cnt ; ++i )
261 struct casefile *out ;
262 struct casefile *cf ;
263 struct casereader *reader ;
264 struct casefile *sorted_cf ;
266 /* Obtain active file in CF. */
267 if (!procedure (ds, NULL, NULL))
270 cf = proc_capture_output (ds);
272 /* Sort CF into SORTED_CF. */
273 reader = casefile_get_destructive_reader (cf) ;
274 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
275 assert ( sc->crits[i].fv == src_vars[i]->fv );
276 sorted_cf = sort_execute (reader, &criteria);
277 casefile_destroy (cf);
279 out = rank_sorted_casefile (sorted_cf, &criteria,
281 rank_specs, n_rank_specs,
282 i, &src_vars[i]->miss) ;
289 proc_set_source (ds, storage_source_create (out));
292 free (criteria.crits);
296 free (criteria.crits);
300 /* Hardly a rank function !! */
302 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
303 int i UNUSED, double w)
310 rank_rank (double c, double cc, double cc_1,
311 int i, double w UNUSED)
325 rank = cc_1 + (c + 1.0)/ 2.0;
345 rank = cc_1 + c / 2.0 ;
360 rank_rfraction (double c, double cc, double cc_1,
363 return rank_rank (c, cc, cc_1, i, w) / w ;
368 rank_percent (double c, double cc, double cc_1,
371 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
376 rank_proportion (double c, double cc, double cc_1,
379 const double r = rank_rank (c, cc, cc_1, i, w) ;
383 switch ( cmd.fraction )
386 f = (r - 3.0/8.0) / (w + 0.25);
392 f = (r - 1.0/3.0) / (w + 1.0/3.0);
402 return (f > 0) ? f : SYSMIS;
406 rank_normal (double c, double cc, double cc_1,
409 double f = rank_proportion (c, cc, cc_1, i, w);
411 return gsl_cdf_ugaussian_Pinv (f);
415 rank_ntiles (double c, double cc, double cc_1,
418 double r = rank_rank (c, cc, cc_1, i, w);
421 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
424 /* Expected value of the order statistics from an exponential distribution */
426 ee (int j, double w_star)
431 for (k = 1 ; k <= j; k++)
432 sum += 1.0 / ( w_star + 1 - k );
439 rank_savage (double c, double cc, double cc_1,
440 int i UNUSED, double w)
443 const int i_1 = floor (cc_1);
444 const int i_2 = floor (cc);
446 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
448 const double g_1 = cc_1 - i_1;
449 const double g_2 = cc - i_2;
451 /* The second factor is infinite, when the first is zero.
452 Therefore, evaluate the second, only when the first is non-zero */
453 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
454 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
457 return ee (i_1 + 1, w_star) - 1;
459 if ( i_1 + 1 == i_2 )
460 return ( ( expr1 + expr2 )/c ) - 1;
462 if ( i_1 + 2 <= i_2 )
466 for (j = i_1 + 2 ; j <= i_2; ++j )
467 sigma += ee (j, w_star);
468 return ( (expr1 + expr2 + sigma) / c) -1;
475 /* Rank the casefile belonging to CR, starting from the current
476 postition of CR continuing up to and including the ENDth case.
478 RS points to an array containing the rank specifications to
479 use. N_RANK_SPECS is the number of elements of RS.
482 DEST_VAR_INDEX is the index into the rank_spec destvar element
483 to be used for this ranking.
485 Prerequisites: 1. The casefile must be sorted according to CRITERION.
486 2. W is the sum of the non-missing caseweights for this
487 range of the casefile.
490 rank_cases (struct casereader *cr,
492 const struct dictionary *dict,
493 const struct sort_criterion *criterion,
494 const struct missing_values *mv,
496 const struct rank_spec *rs,
499 struct casefile *dest)
506 const int fv = criterion->fv;
507 const int width = criterion->width;
509 while (casereader_cnum (cr) < end)
511 struct casereader *lookahead;
512 const union value *this_value;
513 struct ccase this_case, lookahead_case;
518 if (!casereader_read_xfer (cr, &this_case))
521 this_value = case_data (&this_case, fv);
522 c = dict_get_case_weight (dict, &this_case, &warn);
524 lookahead = casereader_clone (cr);
526 while (casereader_cnum (lookahead) < end
527 && casereader_read_xfer (lookahead, &lookahead_case))
529 const union value *lookahead_value = case_data (&lookahead_case, fv);
530 int diff = compare_values (this_value, lookahead_value, width);
534 /* Make sure the casefile was sorted */
535 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
537 case_destroy (&lookahead_case);
541 c += dict_get_case_weight (dict, &lookahead_case, &warn);
542 case_destroy (&lookahead_case);
545 casereader_destroy (lookahead);
548 if ( !value_is_missing (mv, this_value) )
553 for (i = 0; i < n_rank_specs; ++i)
555 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
557 if ( value_is_missing (mv, this_value) )
558 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
560 case_data_rw (&this_case, dest_idx)->f =
561 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
563 casefile_append_xfer (dest, &this_case);
565 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
567 if ( !value_is_missing (mv, this_value) )
571 /* If this isn't true, then all the results will be wrong */
576 same_group (const struct ccase *a, const struct ccase *b,
577 const struct sort_criteria *crit)
581 for (i = 0; i < crit->crit_cnt - 1; i++)
583 struct sort_criterion *c = &crit->crits[i];
584 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
592 static struct casefile *
593 rank_sorted_casefile (struct casefile *cf,
594 const struct sort_criteria *crit,
595 const struct dictionary *dict,
596 const struct rank_spec *rs,
599 const struct missing_values *mv)
601 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
602 struct casereader *lookahead = casefile_get_reader (cf, NULL);
603 struct casereader *pos = casereader_clone (lookahead);
604 struct ccase group_case;
607 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
609 if (casereader_read (lookahead, &group_case))
611 struct ccase this_case;
612 const union value *this_value ;
614 this_value = case_data( &group_case, ultimate_crit->fv);
616 if ( !value_is_missing(mv, this_value) )
617 w = dict_get_case_weight (dict, &group_case, &warn);
619 while (casereader_read (lookahead, &this_case))
621 const union value *this_value =
622 case_data(&this_case, ultimate_crit->fv);
623 double c = dict_get_case_weight (dict, &this_case, &warn);
624 if (!same_group (&group_case, &this_case, crit))
626 rank_cases (pos, casereader_cnum (lookahead) - 1,
634 case_destroy (&group_case);
635 case_move (&group_case, &this_case);
637 if ( !value_is_missing (mv, this_value) )
639 case_destroy (&this_case);
641 case_destroy (&group_case);
642 rank_cases (pos, ULONG_MAX, dict, ultimate_crit, mv, w,
643 rs, n_rank_specs, dest_idx, dest);
646 if (casefile_error (dest))
648 casefile_destroy (dest);
652 casefile_destroy (cf);
657 /* Transformation function to enumerate all the cases */
659 create_resort_key (void *key_var_, struct ccase *cc, casenumber case_num)
661 struct variable *key_var = key_var_;
663 case_data_rw(cc, key_var->fv)->f = case_num;
665 return TRNS_CONTINUE;
669 /* Create and return a new variable in which to store the ranks of SRC_VAR
670 accoring to the rank function F.
671 VNAME is the name of the variable to be created.
672 If VNAME is NULL, then a name will be automatically chosen.
674 static struct variable *
675 create_rank_variable (struct dictionary *dict, enum RANK_FUNC f,
676 const struct variable *src_var,
680 struct variable *var = NULL;
681 char name[SHORT_NAME_LEN + 1];
684 var = dict_create_var(dict, vname, 0);
688 snprintf(name, SHORT_NAME_LEN + 1, "%c%s",
689 function_name[f][0], src_var->name);
691 var = dict_create_var(dict, name, 0);
698 snprintf(func_abb, 4, "%s", function_name[f]);
699 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
702 var = dict_create_var(dict, name, 0);
708 while ( NULL == var )
711 snprintf(func_abb, 3, "%s", function_name[f]);
713 snprintf(name, SHORT_NAME_LEN + 1,
714 "RNK%s%02d", func_abb, i);
716 var = dict_create_var(dict, name, 0);
723 msg(ME, _("Cannot create new rank variable. All candidates in use."));
727 var->write = var->print = dest_format[f];
742 for (i = 0 ; i < n_rank_specs ; ++i )
744 free (rank_specs[i].destvars);
751 sort_destroy_criteria (sc);
760 cmd_rank (struct lexer *lexer, struct dataset *ds)
763 struct variable *order;
767 if ( !parse_rank (lexer, ds, &cmd, NULL) )
773 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
774 if (cmd.miss == RANK_INCLUDE )
775 value_is_missing = mv_is_value_system_missing;
777 value_is_missing = mv_is_value_missing;
780 /* Default to /RANK if no function subcommands are given */
781 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
782 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
783 cmd.sbc_percent || cmd.sbc_rank ) )
785 assert ( n_rank_specs == 0 );
787 rank_specs = xmalloc (sizeof (*rank_specs));
788 rank_specs[0].rfunc = RANK;
789 rank_specs[0].destvars =
790 xcalloc (sc->crit_cnt, sizeof (struct variable *));
795 assert ( sc->crit_cnt == n_src_vars);
797 /* Create variables for all rank destinations which haven't
798 already been created with INTO.
799 Add labels to all the destination variables.
801 for (i = 0 ; i < n_rank_specs ; ++i )
804 for ( v = 0 ; v < n_src_vars ; v ++ )
806 if ( rank_specs[i].destvars[v] == NULL )
808 rank_specs[i].destvars[v] =
809 create_rank_variable (dataset_dict(ds), rank_specs[i].rfunc, src_vars[v], NULL);
812 create_var_label ( rank_specs[i].destvars[v],
814 rank_specs[i].rfunc);
818 if ( cmd.print == RANK_YES )
822 tab_output_text (0, _("Variables Created By RANK"));
823 tab_output_text (0, "\n");
825 for (i = 0 ; i < n_rank_specs ; ++i )
827 for ( v = 0 ; v < n_src_vars ; v ++ )
829 if ( n_group_vars > 0 )
831 struct string varlist;
834 ds_init_empty (&varlist);
835 for ( g = 0 ; g < n_group_vars ; ++g )
837 ds_put_cstr (&varlist, group_vars[g]->name);
839 if ( g < n_group_vars - 1)
840 ds_put_cstr (&varlist, " ");
843 if ( rank_specs[i].rfunc == NORMAL ||
844 rank_specs[i].rfunc == PROPORTION )
845 tab_output_text (TAT_PRINTF,
846 _("%s into %s(%s of %s using %s BY %s)"),
848 rank_specs[i].destvars[v]->name,
849 function_name[rank_specs[i].rfunc],
856 tab_output_text (TAT_PRINTF,
857 _("%s into %s(%s of %s BY %s)"),
859 rank_specs[i].destvars[v]->name,
860 function_name[rank_specs[i].rfunc],
864 ds_destroy (&varlist);
868 if ( rank_specs[i].rfunc == NORMAL ||
869 rank_specs[i].rfunc == PROPORTION )
870 tab_output_text (TAT_PRINTF,
871 _("%s into %s(%s of %s using %s)"),
873 rank_specs[i].destvars[v]->name,
874 function_name[rank_specs[i].rfunc],
880 tab_output_text (TAT_PRINTF,
881 _("%s into %s(%s of %s)"),
883 rank_specs[i].destvars[v]->name,
884 function_name[rank_specs[i].rfunc],
892 if ( cmd.sbc_fraction &&
893 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
894 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
896 /* Add a variable which we can sort by to get back the original
898 order = dict_create_var_assert (dataset_dict (ds), "$ORDER_", 0);
900 add_transformation (ds, create_resort_key, 0, order);
903 result = rank_cmd (ds, sc, rank_specs, n_rank_specs);
905 /* Put the active file back in its original order */
907 struct sort_criteria criteria;
908 struct sort_criterion restore_criterion ;
909 restore_criterion.fv = order->fv;
910 restore_criterion.width = 0;
911 restore_criterion.dir = SRT_ASCEND;
913 criteria.crits = &restore_criterion;
914 criteria.crit_cnt = 1;
916 sort_active_file_in_place (ds, &criteria);
919 /* ... and we don't need our sort key anymore. So delete it */
920 dict_delete_var (dataset_dict (ds), order);
924 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
928 /* Parser for the variables sub command
929 Returns 1 on success */
931 rank_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd UNUSED, void *aux UNUSED)
933 static const int terminators[2] = {T_BY, 0};
935 lex_match (lexer, '=');
937 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL)
938 && lex_token (lexer) != T_ALL)
941 sc = sort_parse_criteria (lexer, dataset_dict (ds),
942 &src_vars, &n_src_vars, 0, terminators);
944 if ( lex_match (lexer, T_BY) )
946 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
951 if (!parse_variables (lexer, dataset_dict (ds),
952 &group_vars, &n_group_vars,
953 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
964 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
966 parse_rank_function (struct lexer *lexer, struct dictionary *dict, struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
971 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
972 rank_specs[n_rank_specs - 1].rfunc = f;
973 rank_specs[n_rank_specs - 1].destvars = NULL;
975 rank_specs[n_rank_specs - 1].destvars =
976 xcalloc (sc->crit_cnt, sizeof (struct variable *));
978 if (lex_match_id (lexer, "INTO"))
980 struct variable *destvar;
982 while( lex_token (lexer) == T_ID )
985 if ( dict_lookup_var (dict, lex_tokid (lexer)) != NULL )
987 msg(SE, _("Variable %s already exists."), lex_tokid (lexer));
990 if ( var_count >= sc->crit_cnt )
992 msg(SE, _("Too many variables in INTO clause."));
996 destvar = create_rank_variable (dict, f, src_vars[var_count], lex_tokid (lexer));
997 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1009 rank_custom_rank (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1011 struct dictionary *dict = dataset_dict (ds);
1013 return parse_rank_function (lexer, dict, cmd, RANK);
1017 rank_custom_normal (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1019 struct dictionary *dict = dataset_dict (ds);
1021 return parse_rank_function (lexer, dict, cmd, NORMAL);
1025 rank_custom_percent (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1027 struct dictionary *dict = dataset_dict (ds);
1029 return parse_rank_function (lexer, dict, cmd, PERCENT);
1033 rank_custom_rfraction (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1035 struct dictionary *dict = dataset_dict (ds);
1037 return parse_rank_function (lexer, dict, cmd, RFRACTION);
1041 rank_custom_proportion (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1043 struct dictionary *dict = dataset_dict (ds);
1045 return parse_rank_function (lexer, dict, cmd, PROPORTION);
1049 rank_custom_n (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1051 struct dictionary *dict = dataset_dict (ds);
1053 return parse_rank_function (lexer, dict, cmd, N);
1057 rank_custom_savage (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1059 struct dictionary *dict = dataset_dict (ds);
1061 return parse_rank_function (lexer, dict, cmd, SAVAGE);
1066 rank_custom_ntiles (struct lexer *lexer, struct dataset *ds, struct cmd_rank *cmd, void *aux UNUSED )
1068 struct dictionary *dict = dataset_dict (ds);
1070 if ( lex_force_match (lexer, '(') )
1072 if ( lex_force_int (lexer) )
1074 k_ntiles = lex_integer (lexer);
1076 lex_force_match (lexer, ')');
1084 return parse_rank_function (lexer, dict, cmd, NTILES);