1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char *function_name[n_RANK_FUNCS] = {
131 static rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct rank_spec *rs,
174 const struct missing_values *miss
179 static char name[10];
180 switch ( cmd.fraction )
183 strcpy (name, "BLOM");
186 strcpy (name, "RANKIT");
189 strcpy (name, "TUKEY");
200 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
202 create_var_label (struct variable *dest_var,
203 const struct variable *src_var, enum RANK_FUNC f)
206 ds_init_empty (&label);
208 if ( n_group_vars > 0 )
210 struct string group_var_str;
213 ds_init_empty (&group_var_str);
215 for (g = 0 ; g < n_group_vars ; ++g )
217 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
218 ds_put_cstr (&group_var_str, group_vars[g]->name);
221 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
222 src_var->name, ds_cstr (&group_var_str));
223 ds_destroy (&group_var_str);
226 ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name);
228 dest_var->label = strdup (ds_cstr (&label) );
235 rank_cmd (const struct sort_criteria *sc,
236 const struct rank_spec *rank_specs, int n_rank_specs)
238 struct sort_criteria criteria;
241 const int n_splits = dict_get_split_cnt (default_dict);
243 criteria.crit_cnt = n_splits + n_group_vars + 1;
244 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
245 for (i = 0; i < n_splits ; i++)
247 struct variable *v = dict_get_split_vars (default_dict)[i];
248 criteria.crits[i].fv = v->fv;
249 criteria.crits[i].width = v->width;
250 criteria.crits[i].dir = SRT_ASCEND;
252 for (i = 0; i < n_group_vars; i++)
254 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
255 criteria.crits[i + n_splits].width = group_vars[i]->width;
256 criteria.crits[i + n_splits].dir = SRT_ASCEND;
258 for (i = 0 ; i < sc->crit_cnt ; ++i )
260 struct casefile *out ;
261 struct casefile *cf ;
262 struct casereader *reader ;
263 struct casefile *sorted_cf ;
265 /* Obtain active file in CF. */
266 if (!procedure (NULL, NULL))
269 cf = proc_capture_output ();
271 /* Sort CF into SORTED_CF. */
272 reader = casefile_get_destructive_reader (cf) ;
273 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
274 assert ( sc->crits[i].fv == src_vars[i]->fv );
275 sorted_cf = sort_execute (reader, &criteria);
276 casefile_destroy (cf);
278 out = rank_sorted_casefile (sorted_cf, &criteria,
279 rank_specs, n_rank_specs,
280 i, &src_vars[i]->miss) ;
287 proc_set_source (storage_source_create (out));
290 free (criteria.crits);
294 free (criteria.crits);
298 /* Hardly a rank function !! */
300 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
301 int i UNUSED, double w)
308 rank_rank (double c, double cc, double cc_1,
309 int i, double w UNUSED)
323 rank = cc_1 + (c + 1.0)/ 2.0;
343 rank = cc_1 + c / 2.0 ;
358 rank_rfraction (double c, double cc, double cc_1,
361 return rank_rank (c, cc, cc_1, i, w) / w ;
366 rank_percent (double c, double cc, double cc_1,
369 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
374 rank_proportion (double c, double cc, double cc_1,
377 const double r = rank_rank (c, cc, cc_1, i, w) ;
381 switch ( cmd.fraction )
384 f = (r - 3.0/8.0) / (w + 0.25);
390 f = (r - 1.0/3.0) / (w + 1.0/3.0);
400 return (f > 0) ? f : SYSMIS;
404 rank_normal (double c, double cc, double cc_1,
407 double f = rank_proportion (c, cc, cc_1, i, w);
409 return gsl_cdf_ugaussian_Pinv (f);
413 rank_ntiles (double c, double cc, double cc_1,
416 double r = rank_rank (c, cc, cc_1, i, w);
419 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
422 /* Expected value of the order statistics from an exponential distribution */
424 ee (int j, double w_star)
429 for (k = 1 ; k <= j; k++)
430 sum += 1.0 / ( w_star + 1 - k );
437 rank_savage (double c, double cc, double cc_1,
438 int i UNUSED, double w)
441 const int i_1 = floor (cc_1);
442 const int i_2 = floor (cc);
444 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
446 const double g_1 = cc_1 - i_1;
447 const double g_2 = cc - i_2;
449 /* The second factor is infinite, when the first is zero.
450 Therefore, evaluate the second, only when the first is non-zero */
451 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
452 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
455 return ee (i_1 + 1, w_star) - 1;
457 if ( i_1 + 1 == i_2 )
458 return ( ( expr1 + expr2 )/c ) - 1;
460 if ( i_1 + 2 <= i_2 )
464 for (j = i_1 + 2 ; j <= i_2; ++j )
465 sigma += ee (j, w_star);
466 return ( (expr1 + expr2 + sigma) / c) -1;
473 /* Rank the casefile belonging to CR, starting from the current
474 postition of CR continuing up to and including the ENDth case.
476 RS points to an array containing the rank specifications to
477 use. N_RANK_SPECS is the number of elements of RS.
480 DEST_VAR_INDEX is the index into the rank_spec destvar element
481 to be used for this ranking.
483 Prerequisites: 1. The casefile must be sorted according to CRITERION.
484 2. W is the sum of the non-missing caseweights for this
485 range of the casefile.
488 rank_cases (struct casereader *cr,
490 const struct sort_criterion *criterion,
491 const struct missing_values *mv,
493 const struct rank_spec *rs,
496 struct casefile *dest)
503 const int fv = criterion->fv;
504 const int width = criterion->width;
506 while (casereader_cnum (cr) < end)
508 struct casereader *lookahead;
509 const union value *this_value;
510 struct ccase this_case, lookahead_case;
515 if (!casereader_read_xfer (cr, &this_case))
518 this_value = case_data (&this_case, fv);
519 c = dict_get_case_weight (default_dict, &this_case, &warn);
521 lookahead = casereader_clone (cr);
523 while (casereader_cnum (lookahead) < end
524 && casereader_read_xfer (lookahead, &lookahead_case))
526 const union value *lookahead_value = case_data (&lookahead_case, fv);
527 int diff = compare_values (this_value, lookahead_value, width);
531 /* Make sure the casefile was sorted */
532 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
534 case_destroy (&lookahead_case);
538 c += dict_get_case_weight (default_dict, &lookahead_case, &warn);
539 case_destroy (&lookahead_case);
542 casereader_destroy (lookahead);
545 if ( !value_is_missing (mv, this_value) )
550 for (i = 0; i < n_rank_specs; ++i)
552 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
554 if ( value_is_missing (mv, this_value) )
555 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
557 case_data_rw (&this_case, dest_idx)->f =
558 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
560 casefile_append_xfer (dest, &this_case);
562 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
564 if ( !value_is_missing (mv, this_value) )
568 /* If this isn't true, then all the results will be wrong */
573 same_group (const struct ccase *a, const struct ccase *b,
574 const struct sort_criteria *crit)
578 for (i = 0; i < crit->crit_cnt - 1; i++)
580 struct sort_criterion *c = &crit->crits[i];
581 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
589 static struct casefile *
590 rank_sorted_casefile (struct casefile *cf,
591 const struct sort_criteria *crit,
592 const struct rank_spec *rs,
595 const struct missing_values *mv)
597 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
598 struct casereader *lookahead = casefile_get_reader (cf);
599 struct casereader *pos = casereader_clone (lookahead);
600 struct ccase group_case;
603 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
605 if (casereader_read (lookahead, &group_case))
607 struct ccase this_case;
608 const union value *this_value ;
610 this_value = case_data( &group_case, ultimate_crit->fv);
612 if ( !value_is_missing(mv, this_value) )
613 w = dict_get_case_weight (default_dict, &group_case, &warn);
615 while (casereader_read (lookahead, &this_case))
617 const union value *this_value =
618 case_data(&this_case, ultimate_crit->fv);
619 double c = dict_get_case_weight (default_dict, &this_case, &warn);
620 if (!same_group (&group_case, &this_case, crit))
622 rank_cases (pos, casereader_cnum (lookahead) - 1,
629 case_move (&group_case, &this_case);
631 if ( !value_is_missing (mv, this_value) )
634 rank_cases (pos, ULONG_MAX, ultimate_crit, mv, w,
635 rs, n_rank_specs, dest_idx, dest);
638 if (casefile_error (dest))
640 casefile_destroy (dest);
644 casefile_destroy (cf);
649 /* Transformation function to enumerate all the cases */
651 create_resort_key (void *key_var_, struct ccase *cc, casenum_t case_num)
653 struct variable *key_var = key_var_;
655 case_data_rw(cc, key_var->fv)->f = case_num;
657 return TRNS_CONTINUE;
661 /* Create and return a new variable in which to store the ranks of SRC_VAR
662 accoring to the rank function F.
663 VNAME is the name of the variable to be created.
664 If VNAME is NULL, then a name will be automatically chosen.
666 static struct variable *
667 create_rank_variable (enum RANK_FUNC f,
668 const struct variable *src_var,
672 struct variable *var = NULL;
673 char name[SHORT_NAME_LEN + 1];
676 var = dict_create_var(default_dict, vname, 0);
680 snprintf(name, SHORT_NAME_LEN + 1, "%c%s",
681 function_name[f][0], src_var->name);
683 var = dict_create_var(default_dict, name, 0);
690 snprintf(func_abb, 4, "%s", function_name[f]);
691 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
694 var = dict_create_var(default_dict, name, 0);
700 while ( NULL == var )
703 snprintf(func_abb, 3, "%s", function_name[f]);
705 snprintf(name, SHORT_NAME_LEN + 1,
706 "RNK%s%02d", func_abb, i);
708 var = dict_create_var(default_dict, name, 0);
715 msg(ME, _("Cannot create new rank variable. All candidates in use."));
719 var->write = var->print = dest_format[f];
735 for (i = 0 ; i < n_rank_specs ; ++i )
737 free (rank_specs[i].destvars);
744 sort_destroy_criteria (sc);
756 struct variable *order;
760 if ( !parse_rank(&cmd, NULL) )
766 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
767 if (cmd.miss == RANK_INCLUDE )
768 value_is_missing = mv_is_value_system_missing;
770 value_is_missing = mv_is_value_missing;
773 /* Default to /RANK if no function subcommands are given */
774 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
775 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
776 cmd.sbc_percent || cmd.sbc_rank ) )
778 assert ( n_rank_specs == 0 );
780 rank_specs = xmalloc (sizeof (*rank_specs));
781 rank_specs[0].rfunc = RANK;
782 rank_specs[0].destvars =
783 xcalloc (sc->crit_cnt, sizeof (struct variable *));
788 assert ( sc->crit_cnt == n_src_vars);
790 /* Create variables for all rank destinations which haven't
791 already been created with INTO.
792 Add labels to all the destination variables.
794 for (i = 0 ; i < n_rank_specs ; ++i )
797 for ( v = 0 ; v < n_src_vars ; v ++ )
799 if ( rank_specs[i].destvars[v] == NULL )
801 rank_specs[i].destvars[v] =
802 create_rank_variable (rank_specs[i].rfunc, src_vars[v], NULL);
805 create_var_label ( rank_specs[i].destvars[v],
807 rank_specs[i].rfunc);
811 if ( cmd.print == RANK_YES )
815 tab_output_text (0, _("Variables Created By RANK"));
816 tab_output_text (0, "\n");
818 for (i = 0 ; i < n_rank_specs ; ++i )
820 for ( v = 0 ; v < n_src_vars ; v ++ )
822 if ( n_group_vars > 0 )
824 struct string varlist;
827 ds_init_empty (&varlist);
828 for ( g = 0 ; g < n_group_vars ; ++g )
830 ds_put_cstr (&varlist, group_vars[g]->name);
832 if ( g < n_group_vars - 1)
833 ds_put_cstr (&varlist, " ");
836 if ( rank_specs[i].rfunc == NORMAL ||
837 rank_specs[i].rfunc == PROPORTION )
838 tab_output_text (TAT_PRINTF,
839 _("%s into %s(%s of %s using %s BY %s)"),
841 rank_specs[i].destvars[v]->name,
842 function_name[rank_specs[i].rfunc],
849 tab_output_text (TAT_PRINTF,
850 _("%s into %s(%s of %s BY %s)"),
852 rank_specs[i].destvars[v]->name,
853 function_name[rank_specs[i].rfunc],
857 ds_destroy (&varlist);
861 if ( rank_specs[i].rfunc == NORMAL ||
862 rank_specs[i].rfunc == PROPORTION )
863 tab_output_text (TAT_PRINTF,
864 _("%s into %s(%s of %s using %s)"),
866 rank_specs[i].destvars[v]->name,
867 function_name[rank_specs[i].rfunc],
873 tab_output_text (TAT_PRINTF,
874 _("%s into %s(%s of %s)"),
876 rank_specs[i].destvars[v]->name,
877 function_name[rank_specs[i].rfunc],
885 if ( cmd.sbc_fraction &&
886 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
887 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
889 /* Add a variable which we can sort by to get back the original
891 order = dict_create_var_assert (default_dict, "$ORDER_", 0);
893 add_transformation (create_resort_key, 0, order);
896 result = rank_cmd (sc, rank_specs, n_rank_specs);
898 /* Put the active file back in its original order */
900 struct sort_criteria criteria;
901 struct sort_criterion restore_criterion ;
902 restore_criterion.fv = order->fv;
903 restore_criterion.width = 0;
904 restore_criterion.dir = SRT_ASCEND;
906 criteria.crits = &restore_criterion;
907 criteria.crit_cnt = 1;
909 sort_active_file_in_place (&criteria);
912 /* ... and we don't need our sort key anymore. So delete it */
913 dict_delete_var (default_dict, order);
917 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
921 /* Parser for the variables sub command
922 Returns 1 on success */
924 rank_custom_variables(struct cmd_rank *cmd UNUSED, void *aux UNUSED)
926 static const int terminators[2] = {T_BY, 0};
930 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
934 sc = sort_parse_criteria (default_dict,
935 &src_vars, &n_src_vars, 0, terminators);
937 if ( lex_match(T_BY) )
939 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL))
944 if (!parse_variables (default_dict, &group_vars, &n_group_vars,
945 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
956 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
958 parse_rank_function(struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
963 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
964 rank_specs[n_rank_specs - 1].rfunc = f;
965 rank_specs[n_rank_specs - 1].destvars = NULL;
967 rank_specs[n_rank_specs - 1].destvars =
968 xcalloc (sc->crit_cnt, sizeof (struct variable *));
970 if (lex_match_id("INTO"))
972 struct variable *destvar;
974 while( token == T_ID )
977 if ( dict_lookup_var (default_dict, tokid) != NULL )
979 msg(SE, _("Variable %s already exists."), tokid);
982 if ( var_count >= sc->crit_cnt )
984 msg(SE, _("Too many variables in INTO clause."));
988 destvar = create_rank_variable (f, src_vars[var_count], tokid);
989 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1001 rank_custom_rank(struct cmd_rank *cmd, void *aux UNUSED )
1003 return parse_rank_function(cmd, RANK);
1007 rank_custom_normal(struct cmd_rank *cmd, void *aux UNUSED )
1009 return parse_rank_function(cmd, NORMAL);
1013 rank_custom_percent(struct cmd_rank *cmd, void *aux UNUSED )
1015 return parse_rank_function (cmd, PERCENT);
1019 rank_custom_rfraction(struct cmd_rank *cmd, void *aux UNUSED )
1021 return parse_rank_function(cmd, RFRACTION);
1025 rank_custom_proportion(struct cmd_rank *cmd, void *aux UNUSED )
1027 return parse_rank_function(cmd, PROPORTION);
1031 rank_custom_n(struct cmd_rank *cmd, void *aux UNUSED )
1033 return parse_rank_function(cmd, N);
1037 rank_custom_savage(struct cmd_rank *cmd, void *aux UNUSED )
1039 return parse_rank_function(cmd, SAVAGE);
1044 rank_custom_ntiles(struct cmd_rank *cmd, void *aux UNUSED )
1046 if ( lex_force_match('(') )
1048 if ( lex_force_int() )
1050 k_ntiles = lex_integer ();
1052 lex_force_match(')');
1060 return parse_rank_function(cmd, NTILES);