1 /* PSPP - RANK. -*-c-*-
3 Copyright (C) 2005, 2006 Free Software Foundation, Inc.
4 Author: John Darrington <john@darrington.wattle.id.au>,
5 Ben Pfaff <blp@gnu.org>.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "sort-criteria.h"
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/fastfile.h>
32 #include <data/storage-stream.h>
33 #include <language/command.h>
34 #include <language/stats/sort-criteria.h>
36 #include <libpspp/compiler.h>
37 #include <math/sort.h>
38 #include <output/table.h>
39 #include <output/manager.h>
41 #include <gsl/gsl_cdf.h>
45 #define _(msgid) gettext (msgid)
61 +fraction=fraction:!blom/tukey/vw/rankit;
62 +ties=ties:!mean/low/high/condense;
63 missing=miss:!exclude/include.
68 typedef double (*rank_function_t) (double c, double cc, double cc_1,
71 static double rank_proportion (double c, double cc, double cc_1,
74 static double rank_normal (double c, double cc, double cc_1,
77 static double rank_percent (double c, double cc, double cc_1,
80 static double rank_rfraction (double c, double cc, double cc_1,
83 static double rank_rank (double c, double cc, double cc_1,
86 static double rank_n (double c, double cc, double cc_1,
89 static double rank_savage (double c, double cc, double cc_1,
92 static double rank_ntiles (double c, double cc, double cc_1,
109 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
110 {FMT_F, 9, 3}, /* rank */
111 {FMT_F, 6, 4}, /* normal */
112 {FMT_F, 6, 2}, /* percent */
113 {FMT_F, 6, 4}, /* rfraction */
114 {FMT_F, 6, 4}, /* proportion */
115 {FMT_F, 6, 0}, /* n */
116 {FMT_F, 3, 0}, /* ntiles */
117 {FMT_F, 8, 4} /* savage */
120 static const char * const function_name[n_RANK_FUNCS] = {
131 static const rank_function_t rank_func[n_RANK_FUNCS] = {
145 enum RANK_FUNC rfunc;
146 struct variable **destvars;
150 /* Function to use for testing for missing values */
151 static is_missing_func *value_is_missing;
153 static struct rank_spec *rank_specs;
154 static size_t n_rank_specs;
156 static struct sort_criteria *sc;
158 static struct variable **group_vars;
159 static size_t n_group_vars;
161 static struct variable **src_vars;
162 static size_t n_src_vars;
167 static struct cmd_rank cmd;
169 static struct casefile *rank_sorted_casefile (struct casefile *cf,
170 const struct sort_criteria *,
171 const struct rank_spec *rs,
174 const struct missing_values *miss
179 static char name[10];
180 switch ( cmd.fraction )
183 strcpy (name, "BLOM");
186 strcpy (name, "RANKIT");
189 strcpy (name, "TUKEY");
200 /* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
202 create_var_label (struct variable *dest_var,
203 const struct variable *src_var, enum RANK_FUNC f)
206 ds_init_empty (&label);
208 if ( n_group_vars > 0 )
210 struct string group_var_str;
213 ds_init_empty (&group_var_str);
215 for (g = 0 ; g < n_group_vars ; ++g )
217 if ( g > 0 ) ds_put_cstr (&group_var_str, " ");
218 ds_put_cstr (&group_var_str, group_vars[g]->name);
221 ds_put_format (&label, _("%s of %s by %s"), function_name[f],
222 src_var->name, ds_cstr (&group_var_str));
223 ds_destroy (&group_var_str);
226 ds_put_format (&label,_("%s of %s"), function_name[f], src_var->name);
228 dest_var->label = strdup (ds_cstr (&label) );
235 rank_cmd (const struct sort_criteria *sc,
236 const struct rank_spec *rank_specs, int n_rank_specs)
238 struct sort_criteria criteria;
241 const int n_splits = dict_get_split_cnt (dataset_dict (current_dataset));
243 criteria.crit_cnt = n_splits + n_group_vars + 1;
244 criteria.crits = xnmalloc (criteria.crit_cnt, sizeof *criteria.crits);
245 for (i = 0; i < n_splits ; i++)
247 struct variable *v = dict_get_split_vars (dataset_dict (current_dataset))[i];
248 criteria.crits[i].fv = v->fv;
249 criteria.crits[i].width = v->width;
250 criteria.crits[i].dir = SRT_ASCEND;
252 for (i = 0; i < n_group_vars; i++)
254 criteria.crits[i + n_splits].fv = group_vars[i]->fv;
255 criteria.crits[i + n_splits].width = group_vars[i]->width;
256 criteria.crits[i + n_splits].dir = SRT_ASCEND;
258 for (i = 0 ; i < sc->crit_cnt ; ++i )
260 struct casefile *out ;
261 struct casefile *cf ;
262 struct casereader *reader ;
263 struct casefile *sorted_cf ;
265 /* Obtain active file in CF. */
266 if (!procedure (current_dataset, NULL, NULL))
269 cf = proc_capture_output (current_dataset);
271 /* Sort CF into SORTED_CF. */
272 reader = casefile_get_destructive_reader (cf) ;
273 criteria.crits[criteria.crit_cnt - 1] = sc->crits[i];
274 assert ( sc->crits[i].fv == src_vars[i]->fv );
275 sorted_cf = sort_execute (reader, &criteria);
276 casefile_destroy (cf);
278 out = rank_sorted_casefile (sorted_cf, &criteria,
279 rank_specs, n_rank_specs,
280 i, &src_vars[i]->miss) ;
287 proc_set_source (current_dataset, storage_source_create (out));
290 free (criteria.crits);
294 free (criteria.crits);
298 /* Hardly a rank function !! */
300 rank_n (double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
301 int i UNUSED, double w)
308 rank_rank (double c, double cc, double cc_1,
309 int i, double w UNUSED)
323 rank = cc_1 + (c + 1.0)/ 2.0;
343 rank = cc_1 + c / 2.0 ;
358 rank_rfraction (double c, double cc, double cc_1,
361 return rank_rank (c, cc, cc_1, i, w) / w ;
366 rank_percent (double c, double cc, double cc_1,
369 return rank_rank (c, cc, cc_1, i, w) * 100.0 / w ;
374 rank_proportion (double c, double cc, double cc_1,
377 const double r = rank_rank (c, cc, cc_1, i, w) ;
381 switch ( cmd.fraction )
384 f = (r - 3.0/8.0) / (w + 0.25);
390 f = (r - 1.0/3.0) / (w + 1.0/3.0);
400 return (f > 0) ? f : SYSMIS;
404 rank_normal (double c, double cc, double cc_1,
407 double f = rank_proportion (c, cc, cc_1, i, w);
409 return gsl_cdf_ugaussian_Pinv (f);
413 rank_ntiles (double c, double cc, double cc_1,
416 double r = rank_rank (c, cc, cc_1, i, w);
419 return ( floor (( r * k_ntiles) / ( w + 1) ) + 1);
422 /* Expected value of the order statistics from an exponential distribution */
424 ee (int j, double w_star)
429 for (k = 1 ; k <= j; k++)
430 sum += 1.0 / ( w_star + 1 - k );
437 rank_savage (double c, double cc, double cc_1,
438 int i UNUSED, double w)
441 const int i_1 = floor (cc_1);
442 const int i_2 = floor (cc);
444 const double w_star = (modf (w, &int_part) == 0 ) ? w : floor (w) + 1;
446 const double g_1 = cc_1 - i_1;
447 const double g_2 = cc - i_2;
449 /* The second factor is infinite, when the first is zero.
450 Therefore, evaluate the second, only when the first is non-zero */
451 const double expr1 = (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : ( 1 - g_1);
452 const double expr2 = g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
455 return ee (i_1 + 1, w_star) - 1;
457 if ( i_1 + 1 == i_2 )
458 return ( ( expr1 + expr2 )/c ) - 1;
460 if ( i_1 + 2 <= i_2 )
464 for (j = i_1 + 2 ; j <= i_2; ++j )
465 sigma += ee (j, w_star);
466 return ( (expr1 + expr2 + sigma) / c) -1;
473 /* Rank the casefile belonging to CR, starting from the current
474 postition of CR continuing up to and including the ENDth case.
476 RS points to an array containing the rank specifications to
477 use. N_RANK_SPECS is the number of elements of RS.
480 DEST_VAR_INDEX is the index into the rank_spec destvar element
481 to be used for this ranking.
483 Prerequisites: 1. The casefile must be sorted according to CRITERION.
484 2. W is the sum of the non-missing caseweights for this
485 range of the casefile.
488 rank_cases (struct casereader *cr,
490 const struct sort_criterion *criterion,
491 const struct missing_values *mv,
493 const struct rank_spec *rs,
496 struct casefile *dest)
503 const int fv = criterion->fv;
504 const int width = criterion->width;
506 while (casereader_cnum (cr) < end)
508 struct casereader *lookahead;
509 const union value *this_value;
510 struct ccase this_case, lookahead_case;
515 if (!casereader_read_xfer (cr, &this_case))
518 this_value = case_data (&this_case, fv);
519 c = dict_get_case_weight (dataset_dict (current_dataset), &this_case, &warn);
521 lookahead = casereader_clone (cr);
523 while (casereader_cnum (lookahead) < end
524 && casereader_read_xfer (lookahead, &lookahead_case))
526 const union value *lookahead_value = case_data (&lookahead_case, fv);
527 int diff = compare_values (this_value, lookahead_value, width);
531 /* Make sure the casefile was sorted */
532 assert ( diff == ((criterion->dir == SRT_ASCEND) ? -1 :1));
534 case_destroy (&lookahead_case);
538 c += dict_get_case_weight (dataset_dict (current_dataset), &lookahead_case, &warn);
539 case_destroy (&lookahead_case);
542 casereader_destroy (lookahead);
545 if ( !value_is_missing (mv, this_value) )
550 for (i = 0; i < n_rank_specs; ++i)
552 const int dest_idx = rs[i].destvars[dest_var_index]->fv;
554 if ( value_is_missing (mv, this_value) )
555 case_data_rw (&this_case, dest_idx)->f = SYSMIS;
557 case_data_rw (&this_case, dest_idx)->f =
558 rank_func[rs[i].rfunc](c, cc, cc_1, iter, w);
560 casefile_append_xfer (dest, &this_case);
562 while (n-- > 0 && casereader_read_xfer (cr, &this_case));
564 if ( !value_is_missing (mv, this_value) )
568 /* If this isn't true, then all the results will be wrong */
573 same_group (const struct ccase *a, const struct ccase *b,
574 const struct sort_criteria *crit)
578 for (i = 0; i < crit->crit_cnt - 1; i++)
580 struct sort_criterion *c = &crit->crits[i];
581 if (compare_values (case_data (a, c->fv), case_data (b, c->fv),
589 static struct casefile *
590 rank_sorted_casefile (struct casefile *cf,
591 const struct sort_criteria *crit,
592 const struct rank_spec *rs,
595 const struct missing_values *mv)
597 struct casefile *dest = fastfile_create (casefile_get_value_cnt (cf));
598 struct casereader *lookahead = casefile_get_reader (cf);
599 struct casereader *pos = casereader_clone (lookahead);
600 struct ccase group_case;
603 struct sort_criterion *ultimate_crit = &crit->crits[crit->crit_cnt - 1];
605 if (casereader_read (lookahead, &group_case))
607 struct ccase this_case;
608 const union value *this_value ;
610 this_value = case_data( &group_case, ultimate_crit->fv);
612 if ( !value_is_missing(mv, this_value) )
613 w = dict_get_case_weight (dataset_dict (current_dataset), &group_case, &warn);
615 while (casereader_read (lookahead, &this_case))
617 const union value *this_value =
618 case_data(&this_case, ultimate_crit->fv);
619 double c = dict_get_case_weight (dataset_dict (current_dataset), &this_case, &warn);
620 if (!same_group (&group_case, &this_case, crit))
622 rank_cases (pos, casereader_cnum (lookahead) - 1,
629 case_destroy (&group_case);
630 case_move (&group_case, &this_case);
632 if ( !value_is_missing (mv, this_value) )
634 case_destroy (&this_case);
636 case_destroy (&group_case);
637 rank_cases (pos, ULONG_MAX, ultimate_crit, mv, w,
638 rs, n_rank_specs, dest_idx, dest);
641 if (casefile_error (dest))
643 casefile_destroy (dest);
647 casefile_destroy (cf);
652 /* Transformation function to enumerate all the cases */
654 create_resort_key (void *key_var_, struct ccase *cc, casenum_t case_num)
656 struct variable *key_var = key_var_;
658 case_data_rw(cc, key_var->fv)->f = case_num;
660 return TRNS_CONTINUE;
664 /* Create and return a new variable in which to store the ranks of SRC_VAR
665 accoring to the rank function F.
666 VNAME is the name of the variable to be created.
667 If VNAME is NULL, then a name will be automatically chosen.
669 static struct variable *
670 create_rank_variable (enum RANK_FUNC f,
671 const struct variable *src_var,
675 struct variable *var = NULL;
676 char name[SHORT_NAME_LEN + 1];
679 var = dict_create_var(dataset_dict (current_dataset), vname, 0);
683 snprintf(name, SHORT_NAME_LEN + 1, "%c%s",
684 function_name[f][0], src_var->name);
686 var = dict_create_var(dataset_dict (current_dataset), name, 0);
693 snprintf(func_abb, 4, "%s", function_name[f]);
694 snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
697 var = dict_create_var(dataset_dict (current_dataset), name, 0);
703 while ( NULL == var )
706 snprintf(func_abb, 3, "%s", function_name[f]);
708 snprintf(name, SHORT_NAME_LEN + 1,
709 "RNK%s%02d", func_abb, i);
711 var = dict_create_var(dataset_dict (current_dataset), name, 0);
718 msg(ME, _("Cannot create new rank variable. All candidates in use."));
722 var->write = var->print = dest_format[f];
738 for (i = 0 ; i < n_rank_specs ; ++i )
740 free (rank_specs[i].destvars);
747 sort_destroy_criteria (sc);
759 struct variable *order;
763 if ( !parse_rank(&cmd, NULL) )
769 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
770 if (cmd.miss == RANK_INCLUDE )
771 value_is_missing = mv_is_value_system_missing;
773 value_is_missing = mv_is_value_missing;
776 /* Default to /RANK if no function subcommands are given */
777 if ( !( cmd.sbc_normal || cmd.sbc_ntiles || cmd.sbc_proportion ||
778 cmd.sbc_rfraction || cmd.sbc_savage || cmd.sbc_n ||
779 cmd.sbc_percent || cmd.sbc_rank ) )
781 assert ( n_rank_specs == 0 );
783 rank_specs = xmalloc (sizeof (*rank_specs));
784 rank_specs[0].rfunc = RANK;
785 rank_specs[0].destvars =
786 xcalloc (sc->crit_cnt, sizeof (struct variable *));
791 assert ( sc->crit_cnt == n_src_vars);
793 /* Create variables for all rank destinations which haven't
794 already been created with INTO.
795 Add labels to all the destination variables.
797 for (i = 0 ; i < n_rank_specs ; ++i )
800 for ( v = 0 ; v < n_src_vars ; v ++ )
802 if ( rank_specs[i].destvars[v] == NULL )
804 rank_specs[i].destvars[v] =
805 create_rank_variable (rank_specs[i].rfunc, src_vars[v], NULL);
808 create_var_label ( rank_specs[i].destvars[v],
810 rank_specs[i].rfunc);
814 if ( cmd.print == RANK_YES )
818 tab_output_text (0, _("Variables Created By RANK"));
819 tab_output_text (0, "\n");
821 for (i = 0 ; i < n_rank_specs ; ++i )
823 for ( v = 0 ; v < n_src_vars ; v ++ )
825 if ( n_group_vars > 0 )
827 struct string varlist;
830 ds_init_empty (&varlist);
831 for ( g = 0 ; g < n_group_vars ; ++g )
833 ds_put_cstr (&varlist, group_vars[g]->name);
835 if ( g < n_group_vars - 1)
836 ds_put_cstr (&varlist, " ");
839 if ( rank_specs[i].rfunc == NORMAL ||
840 rank_specs[i].rfunc == PROPORTION )
841 tab_output_text (TAT_PRINTF,
842 _("%s into %s(%s of %s using %s BY %s)"),
844 rank_specs[i].destvars[v]->name,
845 function_name[rank_specs[i].rfunc],
852 tab_output_text (TAT_PRINTF,
853 _("%s into %s(%s of %s BY %s)"),
855 rank_specs[i].destvars[v]->name,
856 function_name[rank_specs[i].rfunc],
860 ds_destroy (&varlist);
864 if ( rank_specs[i].rfunc == NORMAL ||
865 rank_specs[i].rfunc == PROPORTION )
866 tab_output_text (TAT_PRINTF,
867 _("%s into %s(%s of %s using %s)"),
869 rank_specs[i].destvars[v]->name,
870 function_name[rank_specs[i].rfunc],
876 tab_output_text (TAT_PRINTF,
877 _("%s into %s(%s of %s)"),
879 rank_specs[i].destvars[v]->name,
880 function_name[rank_specs[i].rfunc],
888 if ( cmd.sbc_fraction &&
889 ( ! cmd.sbc_normal && ! cmd.sbc_proportion) )
890 msg(MW, _("FRACTION has been specified, but NORMAL and PROPORTION rank functions have not been requested. The FRACTION subcommand will be ignored.") );
892 /* Add a variable which we can sort by to get back the original
894 order = dict_create_var_assert (dataset_dict (current_dataset), "$ORDER_", 0);
896 add_transformation (current_dataset, create_resort_key, 0, order);
899 result = rank_cmd (sc, rank_specs, n_rank_specs);
901 /* Put the active file back in its original order */
903 struct sort_criteria criteria;
904 struct sort_criterion restore_criterion ;
905 restore_criterion.fv = order->fv;
906 restore_criterion.width = 0;
907 restore_criterion.dir = SRT_ASCEND;
909 criteria.crits = &restore_criterion;
910 criteria.crit_cnt = 1;
912 sort_active_file_in_place (&criteria);
915 /* ... and we don't need our sort key anymore. So delete it */
916 dict_delete_var (dataset_dict (current_dataset), order);
920 return (result ? CMD_SUCCESS : CMD_CASCADING_FAILURE);
924 /* Parser for the variables sub command
925 Returns 1 on success */
927 rank_custom_variables(struct cmd_rank *cmd UNUSED, void *aux UNUSED)
929 static const int terminators[2] = {T_BY, 0};
933 if ((token != T_ID || dict_lookup_var (dataset_dict (current_dataset), tokid) == NULL)
937 sc = sort_parse_criteria (dataset_dict (current_dataset),
938 &src_vars, &n_src_vars, 0, terminators);
940 if ( lex_match(T_BY) )
942 if ((token != T_ID || dict_lookup_var (dataset_dict (current_dataset), tokid) == NULL))
947 if (!parse_variables (dataset_dict (current_dataset), &group_vars, &n_group_vars,
948 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
959 /* Parse the [/rank INTO var1 var2 ... varN ] clause */
961 parse_rank_function(struct cmd_rank *cmd UNUSED, enum RANK_FUNC f)
966 rank_specs = xnrealloc(rank_specs, n_rank_specs, sizeof *rank_specs);
967 rank_specs[n_rank_specs - 1].rfunc = f;
968 rank_specs[n_rank_specs - 1].destvars = NULL;
970 rank_specs[n_rank_specs - 1].destvars =
971 xcalloc (sc->crit_cnt, sizeof (struct variable *));
973 if (lex_match_id("INTO"))
975 struct variable *destvar;
977 while( token == T_ID )
980 if ( dict_lookup_var (dataset_dict (current_dataset), tokid) != NULL )
982 msg(SE, _("Variable %s already exists."), tokid);
985 if ( var_count >= sc->crit_cnt )
987 msg(SE, _("Too many variables in INTO clause."));
991 destvar = create_rank_variable (f, src_vars[var_count], tokid);
992 rank_specs[n_rank_specs - 1].destvars[var_count] = destvar ;
1004 rank_custom_rank(struct cmd_rank *cmd, void *aux UNUSED )
1006 return parse_rank_function(cmd, RANK);
1010 rank_custom_normal(struct cmd_rank *cmd, void *aux UNUSED )
1012 return parse_rank_function(cmd, NORMAL);
1016 rank_custom_percent(struct cmd_rank *cmd, void *aux UNUSED )
1018 return parse_rank_function (cmd, PERCENT);
1022 rank_custom_rfraction(struct cmd_rank *cmd, void *aux UNUSED )
1024 return parse_rank_function(cmd, RFRACTION);
1028 rank_custom_proportion(struct cmd_rank *cmd, void *aux UNUSED )
1030 return parse_rank_function(cmd, PROPORTION);
1034 rank_custom_n(struct cmd_rank *cmd, void *aux UNUSED )
1036 return parse_rank_function(cmd, N);
1040 rank_custom_savage(struct cmd_rank *cmd, void *aux UNUSED )
1042 return parse_rank_function(cmd, SAVAGE);
1047 rank_custom_ntiles(struct cmd_rank *cmd, void *aux UNUSED )
1049 if ( lex_force_match('(') )
1051 if ( lex_force_int() )
1053 k_ntiles = lex_integer ();
1055 lex_force_match(')');
1063 return parse_rank_function(cmd, NTILES);