1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char *function_name[n_RANK_FUNCS] = {
131 static rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct rank_spec *rs,
174 const struct missing_values *miss
179 static char name[10];
180 switch ( cmd.fraction )
183 strcpy (name, "BLOM");
186 strcpy (name, "RANKIT");
189 strcpy (name, "TUKEY");
200 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
202 create_var_label (struct variable *dest_var,
203 const struct variable *src_var, enum RANK_FUNC f)
206 ds_init_empty (&label);
208 if ( n_group_vars > 0 )
210 struct string group_var_str;
213 ds_init_empty (&group_var_str);
215 for (g = 0 ; g < n_group_vars ; ++g )
217 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
218 ds_put_cstr (&group_var_str, group_vars[g]->name);
221 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
222 src_var->name, ds_cstr (&group_var_str));
223 ds_destroy (&group_var_str);
226 ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name);
228 dest_var->label = strdup (ds_cstr (&label) );
235 rank_cmd (const struct sort_criteria *sc,
236 const struct rank_spec *rank_specs, int n_rank_specs)
238 struct sort_criteria criteria;
241 const int n_splits = dict_get_split_cnt (default_dict);
243 criteria.crit_cnt = n_splits + n_group_vars + 1;
244 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
245 for (i = 0; i < n_splits ; i++)
247 struct variable *v = dict_get_split_vars (default_dict)[i];
248 criteria.crits[i].fv = v->fv;
249 criteria.crits[i].width = v->width;
250 criteria.crits[i].dir = SRT_ASCEND;
252 for (i = 0; i < n_group_vars; i++)
254 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
255 criteria.crits[i + n_splits].width = group_vars[i]->width;
256 criteria.crits[i + n_splits].dir = SRT_ASCEND;
258 for (i = 0 ; i < sc->crit_cnt ; ++i )
260 struct casefile *out ;
261 struct casefile *cf ;
262 struct casereader *reader ;
263 struct casefile *sorted_cf ;
265 /* Obtain active file in CF. */
266 if (!procedure (NULL, NULL))
268 cf = proc_capture_output ();
270 /* Sort CF into SORTED_CF. */
271 reader = casefile_get_destructive_reader (cf) ;
272 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
273 assert ( sc->crits[i].fv == src_vars[i]->fv );
274 sorted_cf = sort_execute (reader, &criteria);
275 casefile_destroy (cf);
277 out = rank_sorted_casefile (sorted_cf, &criteria,
278 rank_specs, n_rank_specs,
279 i, &src_vars[i]->miss) ;
286 proc_set_source (storage_source_create (out));
288 free (criteria.crits);
293 /* Hardly a rank function !! */
295 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
296 int i UNUSED, double w)
303 rank_rank (double c, double cc, double cc_1,
304 int i, double w UNUSED)
318 rank = cc_1 + (c + 1.0)/ 2.0;
338 rank = cc_1 + c / 2.0 ;
353 rank_rfraction (double c, double cc, double cc_1,
356 return rank_rank (c, cc, cc_1, i, w) / w ;
361 rank_percent (double c, double cc, double cc_1,
364 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
369 rank_proportion (double c, double cc, double cc_1,
372 const double r = rank_rank (c, cc, cc_1, i, w) ;
376 switch ( cmd.fraction )
379 f = (r - 3.0/8.0) / (w + 0.25);
385 f = (r - 1.0/3.0) / (w + 1.0/3.0);
395 return (f > 0) ? f : SYSMIS;
399 rank_normal (double c, double cc, double cc_1,
402 double f = rank_proportion (c, cc, cc_1, i, w);
404 return gsl_cdf_ugaussian_Pinv (f);
408 rank_ntiles (double c, double cc, double cc_1,
411 double r = rank_rank (c, cc, cc_1, i, w);
414 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
417 /* Expected value of the order statistics from an exponential distribution */
419 ee (int j, double w_star)
424 for (k = 1 ; k <= j; k++)
425 sum += 1.0 / ( w_star + 1 - k );
432 rank_savage (double c, double cc, double cc_1,
433 int i UNUSED, double w)
436 const int i_1 = floor (cc_1);
437 const int i_2 = floor (cc);
439 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
441 const double g_1 = cc_1 - i_1;
442 const double g_2 = cc - i_2;
444 /* The second factor is infinite, when the first is zero.
445 Therefore, evaluate the second, only when the first is non-zero */
446 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
447 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
450 return ee (i_1 + 1, w_star) - 1;
452 if ( i_1 + 1 == i_2 )
453 return ( ( expr1 + expr2 )/c ) - 1;
455 if ( i_1 + 2 <= i_2 )
459 for (j = i_1 + 2 ; j <= i_2; ++j )
460 sigma += ee (j, w_star);
461 return ( (expr1 + expr2 + sigma) / c) -1;
468 /* Rank the casefile belonging to CR, starting from the current
469 postition of CR continuing up to and including the ENDth case.
471 RS points to an array containing the rank specifications to
472 use. N_RANK_SPECS is the number of elements of RS.
475 DEST_VAR_INDEX is the index into the rank_spec destvar element
476 to be used for this ranking.
478 Prerequisites: 1. The casefile must be sorted according to CRITERION.
479 2. W is the sum of the non-missing caseweights for this
480 range of the casefile.
483 rank_cases (struct casereader *cr,
485 const struct sort_criterion *criterion,
486 const struct missing_values *mv,
488 const struct rank_spec *rs,
491 struct casefile *dest)
498 const int fv = criterion->fv;
499 const int width = criterion->width;
501 while (casereader_cnum (cr) < end)
503 struct casereader *lookahead;
504 const union value *this_value;
505 struct ccase this_case, lookahead_case;
510 if (!casereader_read_xfer (cr, &this_case))
513 this_value = case_data (&this_case, fv);
514 c = dict_get_case_weight (default_dict, &this_case, &warn);
516 lookahead = casereader_clone (cr);
518 while (casereader_cnum (lookahead) < end
519 && casereader_read_xfer (lookahead, &lookahead_case))
521 const union value *lookahead_value = case_data (&lookahead_case, fv);
522 int diff = compare_values (this_value, lookahead_value, width);
526 /* Make sure the casefile was sorted */
527 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
529 case_destroy (&lookahead_case);
533 c += dict_get_case_weight (default_dict, &lookahead_case, &warn);
534 case_destroy (&lookahead_case);
537 casereader_destroy (lookahead);
540 if ( !value_is_missing (mv, this_value) )
545 for (i = 0; i < n_rank_specs; ++i)
547 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
549 if ( value_is_missing (mv, this_value) )
550 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
552 case_data_rw (&this_case, dest_idx)->f =
553 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
555 casefile_append_xfer (dest, &this_case);
557 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
559 if ( !value_is_missing (mv, this_value) )
563 /* If this isn't true, then all the results will be wrong */
568 same_group (const struct ccase *a, const struct ccase *b,
569 const struct sort_criteria *crit)
573 for (i = 0; i < crit->crit_cnt - 1; i++)
575 struct sort_criterion *c = &crit->crits[i];
576 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
584 static struct casefile *
585 rank_sorted_casefile (struct casefile *cf,
586 const struct sort_criteria *crit,
587 const struct rank_spec *rs,
590 const struct missing_values *mv)
592 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
593 struct casereader *lookahead = casefile_get_reader (cf);
594 struct casereader *pos = casereader_clone (lookahead);
595 struct ccase group_case;
598 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
600 if (casereader_read (lookahead, &group_case))
602 struct ccase this_case;
603 const union value *this_value ;
605 this_value = case_data( &group_case, ultimate_crit->fv);
607 if ( !value_is_missing(mv, this_value) )
608 w = dict_get_case_weight (default_dict, &group_case, &warn);
610 while (casereader_read (lookahead, &this_case))
612 const union value *this_value =
613 case_data(&this_case, ultimate_crit->fv);
614 double c = dict_get_case_weight (default_dict, &this_case, &warn);
615 if (!same_group (&group_case, &this_case, crit))
617 rank_cases (pos, casereader_cnum (lookahead) - 1,
624 case_move (&group_case, &this_case);
626 if ( !value_is_missing (mv, this_value) )
629 rank_cases (pos, ULONG_MAX, ultimate_crit, mv, w,
630 rs, n_rank_specs, dest_idx, dest);
633 if (casefile_error (dest))
635 casefile_destroy (dest);
639 casefile_destroy (cf);
644 /* Transformation function to enumerate all the cases */
646 create_resort_key (void *key_var_, struct ccase *cc, casenum_t case_num)
648 struct variable *key_var = key_var_;
650 case_data_rw(cc, key_var->fv)->f = case_num;
652 return TRNS_CONTINUE;
656 /* Create and return a new variable in which to store the ranks of SRC_VAR
657 accoring to the rank function F.
658 VNAME is the name of the variable to be created.
659 If VNAME is NULL, then a name will be automatically chosen.
661 static struct variable *
662 create_rank_variable (enum RANK_FUNC f,
663 const struct variable *src_var,
667 struct variable *var = NULL;
668 char name[SHORT_NAME_LEN + 1];
671 var = dict_create_var(default_dict, vname, 0);
675 snprintf(name, SHORT_NAME_LEN + 1, "%c%s",
676 function_name[f][0], src_var->name);
678 var = dict_create_var(default_dict, name, 0);
685 snprintf(func_abb, 4, "%s", function_name[f]);
686 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
689 var = dict_create_var(default_dict, name, 0);
695 while ( NULL == var )
698 snprintf(func_abb, 3, "%s", function_name[f]);
700 snprintf(name, SHORT_NAME_LEN + 1,
701 "RNK%s%02d", func_abb, i);
703 var = dict_create_var(default_dict, name, 0);
710 msg(ME, _("Cannot create new rank variable. All candidates in use."));
714 var->write = var->print = dest_format[f];
730 for (i = 0 ; i < n_rank_specs ; ++i )
732 free (rank_specs[i].destvars);
739 sort_destroy_criteria (sc);
751 struct variable *order;
755 if ( !parse_rank(&cmd, NULL) )
761 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
762 if (cmd.miss == RANK_INCLUDE )
763 value_is_missing = mv_is_value_system_missing;
765 value_is_missing = mv_is_value_missing;
768 /* Default to /RANK if no function subcommands are given */
769 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
770 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
771 cmd.sbc_percent || cmd.sbc_rank ) )
773 assert ( n_rank_specs == 0 );
775 rank_specs = xmalloc (sizeof (*rank_specs));
776 rank_specs[0].rfunc = RANK;
777 rank_specs[0].destvars =
778 xcalloc (sc->crit_cnt, sizeof (struct variable *));
783 assert ( sc->crit_cnt == n_src_vars);
785 /* Create variables for all rank destinations which haven't
786 already been created with INTO.
787 Add labels to all the destination variables.
789 for (i = 0 ; i < n_rank_specs ; ++i )
792 for ( v = 0 ; v < n_src_vars ; v ++ )
794 if ( rank_specs[i].destvars[v] == NULL )
796 rank_specs[i].destvars[v] =
797 create_rank_variable (rank_specs[i].rfunc, src_vars[v], NULL);
800 create_var_label ( rank_specs[i].destvars[v],
802 rank_specs[i].rfunc);
806 if ( cmd.print == RANK_YES )
810 tab_output_text (0, _("Variables Created By RANK"));
811 tab_output_text (0, "\n");
813 for (i = 0 ; i < n_rank_specs ; ++i )
815 for ( v = 0 ; v < n_src_vars ; v ++ )
817 if ( n_group_vars > 0 )
819 struct string varlist;
822 ds_init_empty (&varlist);
823 for ( g = 0 ; g < n_group_vars ; ++g )
825 ds_put_cstr (&varlist, group_vars[g]->name);
827 if ( g < n_group_vars - 1)
828 ds_put_cstr (&varlist, " ");
831 if ( rank_specs[i].rfunc == NORMAL ||
832 rank_specs[i].rfunc == PROPORTION )
833 tab_output_text (TAT_PRINTF,
834 _("%s into %s(%s of %s using %s BY %s)"),
836 rank_specs[i].destvars[v]->name,
837 function_name[rank_specs[i].rfunc],
844 tab_output_text (TAT_PRINTF,
845 _("%s into %s(%s of %s BY %s)"),
847 rank_specs[i].destvars[v]->name,
848 function_name[rank_specs[i].rfunc],
852 ds_destroy (&varlist);
856 if ( rank_specs[i].rfunc == NORMAL ||
857 rank_specs[i].rfunc == PROPORTION )
858 tab_output_text (TAT_PRINTF,
859 _("%s into %s(%s of %s using %s)"),
861 rank_specs[i].destvars[v]->name,
862 function_name[rank_specs[i].rfunc],
868 tab_output_text (TAT_PRINTF,
869 _("%s into %s(%s of %s)"),
871 rank_specs[i].destvars[v]->name,
872 function_name[rank_specs[i].rfunc],
880 if ( cmd.sbc_fraction &&
881 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
882 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
884 /* Add a variable which we can sort by to get back the original
886 order = dict_create_var_assert (default_dict, "$ORDER_", 0);
888 add_transformation (create_resort_key, 0, order);
891 result = rank_cmd (sc, rank_specs, n_rank_specs);
893 /* Put the active file back in its original order */
895 struct sort_criteria criteria;
896 struct sort_criterion restore_criterion ;
897 restore_criterion.fv = order->fv;
898 restore_criterion.width = 0;
899 restore_criterion.dir = SRT_ASCEND;
901 criteria.crits = &restore_criterion;
902 criteria.crit_cnt = 1;
904 sort_active_file_in_place (&criteria);
907 /* ... and we don't need our sort key anymore. So delete it */
908 dict_delete_var (default_dict, order);
912 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
916 /* Parser for the variables sub command
917 Returns 1 on success */
919 rank_custom_variables(struct cmd_rank *cmd UNUSED, void *aux UNUSED)
921 static const int terminators[2] = {T_BY, 0};
925 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
929 sc = sort_parse_criteria (default_dict,
930 &src_vars, &n_src_vars, 0, terminators);
932 if ( lex_match(T_BY) )
934 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL))
939 if (!parse_variables (default_dict, &group_vars, &n_group_vars,
940 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
951 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
953 parse_rank_function(struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
958 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
959 rank_specs[n_rank_specs - 1].rfunc = f;
960 rank_specs[n_rank_specs - 1].destvars = NULL;
962 rank_specs[n_rank_specs - 1].destvars =
963 xcalloc (sc->crit_cnt, sizeof (struct variable *));
965 if (lex_match_id("INTO"))
967 struct variable *destvar;
969 while( token == T_ID )
972 if ( dict_lookup_var (default_dict, tokid) != NULL )
974 msg(SE, _("Variable %s already exists."), tokid);
977 if ( var_count >= sc->crit_cnt )
979 msg(SE, _("Too many variables in INTO clause."));
983 destvar = create_rank_variable (f, src_vars[var_count], tokid);
984 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
996 rank_custom_rank(struct cmd_rank *cmd, void *aux UNUSED )
998 return parse_rank_function(cmd, RANK);
1002 rank_custom_normal(struct cmd_rank *cmd, void *aux UNUSED )
1004 return parse_rank_function(cmd, NORMAL);
1008 rank_custom_percent(struct cmd_rank *cmd, void *aux UNUSED )
1010 return parse_rank_function (cmd, PERCENT);
1014 rank_custom_rfraction(struct cmd_rank *cmd, void *aux UNUSED )
1016 return parse_rank_function(cmd, RFRACTION);
1020 rank_custom_proportion(struct cmd_rank *cmd, void *aux UNUSED )
1022 return parse_rank_function(cmd, PROPORTION);
1026 rank_custom_n(struct cmd_rank *cmd, void *aux UNUSED )
1028 return parse_rank_function(cmd, N);
1032 rank_custom_savage(struct cmd_rank *cmd, void *aux UNUSED )
1034 return parse_rank_function(cmd, SAVAGE);
1039 rank_custom_ntiles(struct cmd_rank *cmd, void *aux UNUSED )
1041 if ( lex_force_match('(') )
1043 if ( lex_force_int() )
1045 k_ntiles = lex_integer ();
1047 lex_force_match(')');
1055 return parse_rank_function(cmd, NTILES);