pintos-os.org Git - pspp/blob - src/language/stats/rank.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <math.h>
  20 #include <gsl/gsl_cdf.h>
  21
  22 #include "data/case.h"
  23 #include "data/casegrouper.h"
  24 #include "data/casereader.h"
  25 #include "data/dataset.h"
  26 #include "data/dictionary.h"
  27 #include "data/format.h"
  28 #include "data/variable.h"
  29 #include "data/subcase.h"
  30 #include "data/casewriter.h"
  31 #include "data/short-names.h"
  32 #include "language/command.h"
  33 #include "language/lexer/lexer.h"
  34 #include "language/lexer/variable-parser.h"
  35 #include "language/stats/sort-criteria.h"
  36 #include "math/sort.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/i18n.h"
  39 #include "libpspp/message.h"
  40 #include "libpspp/misc.h"
  41 #include "libpspp/pool.h"
  42 #include "libpspp/string-set.h"
  43 #include "libpspp/taint.h"
  44 #include "output/pivot-table.h"
  45
  46 #include "gettext.h"
  47 #define _(msgid) gettext (msgid)
  48 #define N_(msgid) (msgid)
  49
  50 struct rank;
  51
  52 typedef double (*rank_function_t) (const struct rank*, double c, double cc, double cc_1,
  53                                    int i, double w);
  54
  55 static double rank_proportion (const struct rank *, double c, double cc, double cc_1,
  56                                int i, double w);
  57
  58 static double rank_normal (const struct rank *, double c, double cc, double cc_1,
  59                            int i, double w);
  60
  61 static double rank_percent (const struct rank *, double c, double cc, double cc_1,
  62                             int i, double w);
  63
  64 static double rank_rfraction (const struct rank *, double c, double cc, double cc_1,
  65                               int i, double w);
  66
  67 static double rank_rank (const struct rank *, double c, double cc, double cc_1,
  68                          int i, double w);
  69
  70 static double rank_n (const struct rank *, double c, double cc, double cc_1,
  71                       int i, double w);
  72
  73 static double rank_savage (const struct rank *, double c, double cc, double cc_1,
  74                            int i, double w);
  75
  76 static double rank_ntiles (const struct rank *, double c, double cc, double cc_1,
  77                            int i, double w);
  78
  79
  80 enum rank_func
  81   {
  82     RANK,
  83     NORMAL,
  84     PERCENT,
  85     RFRACTION,
  86     PROPORTION,
  87     N,
  88     NTILES,
  89     SAVAGE,
  90     n_RANK_FUNCS
  91   };
  92
  93 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
  94   [RANK]       = { .type = FMT_F, .w = 9, .d = 3 },
  95   [NORMAL]     = { .type = FMT_F, .w = 6, .d = 4 },
  96   [PERCENT]    = { .type = FMT_F, .w = 6, .d = 2 },
  97   [RFRACTION]  = { .type = FMT_F, .w = 6, .d = 4 },
  98   [PROPORTION] = { .type = FMT_F, .w = 6, .d = 4 },
  99   [N]          = { .type = FMT_F, .w = 6, .d = 0 },
 100   [NTILES]     = { .type = FMT_F, .w = 3, .d = 0 },
 101   [SAVAGE]     = { .type = FMT_F, .w = 8, .d = 4 }
 102 };
 103
 104 static const char * const function_name[n_RANK_FUNCS] = {
 105   "RANK",
 106   "NORMAL",
 107   "PERCENT",
 108   "RFRACTION",
 109   "PROPORTION",
 110   "N",
 111   "NTILES",
 112   "SAVAGE"
 113 };
 114
 115 static const rank_function_t rank_func[n_RANK_FUNCS] = {
 116   rank_rank,
 117   rank_normal,
 118   rank_percent,
 119   rank_rfraction,
 120   rank_proportion,
 121   rank_n,
 122   rank_ntiles,
 123   rank_savage
 124 };
 125
 126 static enum measure rank_measures[n_RANK_FUNCS] = {
 127   [RANK] = MEASURE_ORDINAL,
 128   [NORMAL] = MEASURE_ORDINAL,
 129   [PERCENT] = MEASURE_ORDINAL,
 130   [RFRACTION] = MEASURE_ORDINAL,
 131   [PROPORTION] = MEASURE_ORDINAL,
 132   [N] = MEASURE_SCALE,
 133   [NTILES] = MEASURE_ORDINAL,
 134   [SAVAGE] = MEASURE_ORDINAL,
 135 };
 136
 137 enum ties
 138   {
 139     TIES_LOW,
 140     TIES_HIGH,
 141     TIES_MEAN,
 142     TIES_CONDENSE
 143   };
 144
 145 enum fraction
 146   {
 147     FRAC_BLOM,
 148     FRAC_RANKIT,
 149     FRAC_TUKEY,
 150     FRAC_VW
 151   };
 152
 153 struct rank_spec
 154 {
 155   enum rank_func rfunc;
 156   const char **dest_names;
 157   const char **dest_labels;
 158 };
 159
 160 /* If NEW_NAME exists in DICT or NEW_NAMES, returns NULL without changing
 161    anything.  Otherwise, inserts NEW_NAME in NEW_NAMES and returns the copy of
 162    NEW_NAME now in NEW_NAMES. */
 163 static const char *
 164 try_new_name (const char *new_name,
 165               const struct dictionary *dict, struct string_set *new_names)
 166 {
 167   return (!dict_lookup_var (dict, new_name)
 168           && string_set_insert (new_names, new_name)
 169           ? string_set_find_node (new_names, new_name)->string
 170           : NULL);
 171 }
 172
 173 /* Returns a variable name for storing ranks of a variable named SRC_NAME
 174    according to the rank function F.  The name chosen will not be one already in
 175    DICT or NEW_NAMES.
 176
 177    If successful, adds the new name to NEW_NAMES and returns the name added.
 178    If no name can be generated, returns NULL. */
 179 static const char *
 180 rank_choose_dest_name (struct dictionary *dict, struct string_set *new_names,
 181                        enum rank_func f, const char *src_name)
 182 {
 183   char *src_name_7;
 184   char name[128];
 185   const char *s;
 186   int i;
 187
 188   /* Try the first character of the ranking function followed by the first 7
 189      bytes of the srcinal variable name. */
 190   src_name_7 = utf8_encoding_trunc (src_name, dict_get_encoding (dict), 7);
 191   snprintf (name, sizeof name, "%c%s", function_name[f][0], src_name_7);
 192   free (src_name_7);
 193   s = try_new_name (name, dict, new_names);
 194   if (s != NULL)
 195     return s;
 196
 197   /* Try "fun###". */
 198   for (i = 1; i <= 999; i++)
 199     {
 200       sprintf (name, "%.3s%03d", function_name[f], i);
 201       s = try_new_name (name, dict, new_names);
 202       if (s != NULL)
 203         return s;
 204     }
 205
 206   /* Try "RNKfn##". */
 207   for (i = 1; i <= 99; i++)
 208     {
 209       sprintf (name, "RNK%.2s%02d", function_name[f], i);
 210       s = try_new_name (name, dict, new_names);
 211       if (s != NULL)
 212         return s;
 213     }
 214
 215   msg (ME, _("Cannot generate variable name for ranking %s with %s.  "
 216              "All candidates in use."),
 217        src_name, function_name[f]);
 218   return NULL;
 219 }
 220
 221 struct rank
 222 {
 223   struct dictionary *dict;
 224
 225   struct subcase sc;
 226
 227   const struct variable **vars;
 228   size_t n_vars;
 229
 230   const struct variable **group_vars;
 231   size_t n_group_vars;
 232
 233
 234   enum mv_class exclude;
 235
 236   struct rank_spec *rs;
 237   size_t n_rs;
 238
 239   enum ties ties;
 240
 241   enum fraction fraction;
 242   int k_ntiles;
 243
 244   bool print;
 245
 246   /* Pool on which cell functions may allocate data */
 247   struct pool *pool;
 248 };
 249
 250
 251 static void
 252 destroy_rank (struct rank *rank)
 253 {
 254  free (rank->vars);
 255  free (rank->group_vars);
 256  subcase_uninit (&rank->sc);
 257  pool_destroy (rank->pool);
 258 }
 259
 260 static bool
 261 parse_into (struct lexer *lexer, struct rank *cmd,
 262             struct string_set *new_names)
 263 {
 264   int var_count = 0;
 265   struct rank_spec *rs = NULL;
 266
 267   cmd->rs = pool_realloc (cmd->pool, cmd->rs, sizeof (*cmd->rs) * (cmd->n_rs + 1));
 268   rs = &cmd->rs[cmd->n_rs];
 269
 270   if (lex_match_id (lexer, "RANK"))
 271     {
 272       rs->rfunc = RANK;
 273     }
 274   else if (lex_match_id (lexer, "NORMAL"))
 275     {
 276       rs->rfunc = NORMAL;
 277     }
 278   else if (lex_match_id (lexer, "RFRACTION"))
 279     {
 280       rs->rfunc = RFRACTION;
 281     }
 282   else if (lex_match_id (lexer, "N"))
 283     {
 284       rs->rfunc = N;
 285     }
 286   else if (lex_match_id (lexer, "SAVAGE"))
 287     {
 288       rs->rfunc = SAVAGE;
 289     }
 290   else if (lex_match_id (lexer, "PERCENT"))
 291     {
 292       rs->rfunc = PERCENT;
 293     }
 294   else if (lex_match_id (lexer, "PROPORTION"))
 295     {
 296       rs->rfunc = PROPORTION;
 297     }
 298   else if (lex_match_id (lexer, "NTILES"))
 299     {
 300       if (!lex_force_match (lexer, T_LPAREN))
 301         return false;
 302
 303       if (! lex_force_int_range (lexer, "NTILES", 1, INT_MAX))
 304         return false;
 305
 306       cmd->k_ntiles = lex_integer (lexer);
 307       lex_get (lexer);
 308
 309       if (!lex_force_match (lexer, T_RPAREN))
 310         return false;
 311
 312       rs->rfunc = NTILES;
 313     }
 314   else
 315     {
 316       lex_error (lexer, NULL);
 317       return false;
 318     }
 319
 320   cmd->n_rs++;
 321   rs->dest_names = pool_calloc (cmd->pool, cmd->n_vars,
 322                                 sizeof *rs->dest_names);
 323
 324   if (lex_match_id (lexer, "INTO"))
 325     {
 326       while(lex_token (lexer) == T_ID)
 327         {
 328           const char *name = lex_tokcstr (lexer);
 329
 330           if (var_count >= subcase_get_n_fields (&cmd->sc))
 331             msg (SE, _("Too many variables in %s clause."), "INTO");
 332           else if (dict_lookup_var (cmd->dict, name) != NULL)
 333             msg (SE, _("Variable %s already exists."), name);
 334           else if (string_set_contains (new_names, name))
 335             msg (SE, _("Duplicate variable name %s."), name);
 336           else
 337             {
 338               string_set_insert (new_names, name);
 339               rs->dest_names[var_count++] = pool_strdup (cmd->pool, name);
 340               lex_get (lexer);
 341               continue;
 342             }
 343
 344           /* Error path. */
 345           return false;
 346         }
 347     }
 348
 349   return true;
 350 }
 351
 352 /* Hardly a rank function !! */
 353 static double
 354 rank_n (const struct rank *cmd UNUSED, double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
 355         int i UNUSED, double w)
 356 {
 357   return w;
 358 }
 359
 360
 361 static double
 362 rank_rank (const struct rank *cmd, double c, double cc, double cc_1,
 363            int i, double w UNUSED)
 364 {
 365   double rank;
 366
 367   if (c >= 1.0)
 368     {
 369       switch (cmd->ties)
 370         {
 371         case TIES_LOW:
 372           rank = cc_1 + 1;
 373           break;
 374         case TIES_HIGH:
 375           rank = cc;
 376           break;
 377         case TIES_MEAN:
 378           rank = cc_1 + (c + 1.0)/ 2.0;
 379           break;
 380         case TIES_CONDENSE:
 381           rank = i;
 382           break;
 383         default:
 384           NOT_REACHED ();
 385         }
 386     }
 387   else
 388     {
 389       switch (cmd->ties)
 390         {
 391         case TIES_LOW:
 392           rank = cc_1;
 393           break;
 394         case TIES_HIGH:
 395           rank = cc;
 396           break;
 397         case TIES_MEAN:
 398           rank = cc_1 + c / 2.0 ;
 399           break;
 400         case TIES_CONDENSE:
 401           rank = i;
 402           break;
 403         default:
 404           NOT_REACHED ();
 405         }
 406     }
 407
 408   return rank;
 409 }
 410
 411
 412 static double
 413 rank_rfraction (const struct rank *cmd, double c, double cc, double cc_1,
 414                 int i, double w)
 415 {
 416   return rank_rank (cmd, c, cc, cc_1, i, w) / w ;
 417 }
 418
 419
 420 static double
 421 rank_percent (const struct rank *cmd, double c, double cc, double cc_1,
 422               int i, double w)
 423 {
 424   return rank_rank (cmd, c, cc, cc_1, i, w) * 100.0 / w ;
 425 }
 426
 427
 428 static double
 429 rank_proportion (const struct rank *cmd, double c, double cc, double cc_1,
 430                  int i, double w)
 431 {
 432   const double r =  rank_rank (cmd, c, cc, cc_1, i, w) ;
 433
 434   double f;
 435
 436   switch (cmd->fraction)
 437     {
 438     case FRAC_BLOM:
 439       f =  (r - 3.0/8.0) / (w + 0.25);
 440       break;
 441     case FRAC_RANKIT:
 442       f = (r - 0.5) / w ;
 443       break;
 444     case FRAC_TUKEY:
 445       f = (r - 1.0/3.0) / (w + 1.0/3.0);
 446       break;
 447     case FRAC_VW:
 448       f = r / (w + 1.0);
 449       break;
 450     default:
 451       NOT_REACHED ();
 452     }
 453
 454
 455   return (f > 0) ? f : SYSMIS;
 456 }
 457
 458 static double
 459 rank_normal (const struct rank *cmd, double c, double cc, double cc_1,
 460              int i, double w)
 461 {
 462   double f = rank_proportion (cmd, c, cc, cc_1, i, w);
 463
 464   return gsl_cdf_ugaussian_Pinv (f);
 465 }
 466
 467 static double
 468 rank_ntiles (const struct rank *cmd, double c, double cc, double cc_1,
 469              int i, double w)
 470 {
 471   double r = rank_rank (cmd, c, cc, cc_1, i, w);
 472
 473
 474   return (floor ((r * cmd->k_ntiles) / (w + 1)) + 1);
 475 }
 476
 477 /* Expected value of the order statistics from an exponential distribution */
 478 static double
 479 ee (int j, double w_star)
 480 {
 481   int k;
 482   double sum = 0.0;
 483
 484   for (k = 1 ; k <= j; k++)
 485     sum += 1.0 / (w_star + 1 - k);
 486
 487   return sum;
 488 }
 489
 490
 491 static double
 492 rank_savage (const struct rank *cmd UNUSED, double c, double cc, double cc_1,
 493              int i UNUSED, double w)
 494 {
 495   double int_part;
 496   const int i_1 = floor (cc_1);
 497   const int i_2 = floor (cc);
 498
 499   const double w_star = (modf (w, &int_part) == 0) ? w : floor (w) + 1;
 500
 501   const double g_1 = cc_1 - i_1;
 502   const double g_2 = cc - i_2;
 503
 504   /* The second factor is infinite, when the first is zero.
 505      Therefore, evaluate the second, only when the first is non-zero */
 506   const double expr1 =  (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : (1 - g_1);
 507   const double expr2 =  g_2 ? g_2 * ee (i_2+1, w_star) : g_2 ;
 508
 509   if (i_1 == i_2)
 510     return ee (i_1 + 1, w_star) - 1;
 511
 512   if (i_1 + 1 == i_2)
 513     return ((expr1 + expr2)/c) - 1;
 514
 515   if (i_1 + 2 <= i_2)
 516     {
 517       int j;
 518       double sigma = 0.0;
 519       for (j = i_1 + 2 ; j <= i_2; ++j)
 520         sigma += ee (j, w_star);
 521       return ((expr1 + expr2 + sigma) / c) -1;
 522     }
 523
 524   NOT_REACHED();
 525 }
 526
 527 static double
 528 sum_weights (const struct casereader *input, int weight_idx)
 529 {
 530   if (weight_idx == -1)
 531     return casereader_count_cases (input);
 532   else
 533     {
 534       struct casereader *pass;
 535       struct ccase *c;
 536       double w;
 537
 538       w = 0.0;
 539       pass = casereader_clone (input);
 540       for (; (c = casereader_read (pass)) != NULL; case_unref (c))
 541         w += case_num_idx (c, weight_idx);
 542       casereader_destroy (pass);
 543
 544       return w;
 545     }
 546 }
 547
 548 static void
 549 rank_sorted_file (struct casereader *input,
 550                   struct casewriter *output,
 551                   int weight_idx,
 552                   const struct rank *cmd)
 553 {
 554   struct casegrouper *tie_grouper;
 555   struct casereader *tied_cases;
 556   struct subcase input_var;
 557   int tie_group = 1;
 558   struct ccase *c;
 559   double cc = 0.0;
 560   double w;
 561
 562   /* Get total group weight. */
 563   w = sum_weights (input, weight_idx);
 564
 565   /* Do ranking. */
 566   subcase_init (&input_var, 0, 0, SC_ASCEND);
 567   tie_grouper = casegrouper_create_subcase (input, &input_var);
 568   subcase_uninit (&input_var);
 569   for (; casegrouper_get_next_group (tie_grouper, &tied_cases);
 570        casereader_destroy (tied_cases))
 571     {
 572       double tw = sum_weights (tied_cases, weight_idx);
 573       double cc_1 = cc;
 574       cc += tw;
 575
 576       taint_propagate (casereader_get_taint (tied_cases),
 577                        casewriter_get_taint (output));
 578
 579       /* Rank tied cases. */
 580       for (; (c = casereader_read (tied_cases)) != NULL; case_unref (c))
 581         {
 582           struct ccase *out_case;
 583           size_t i;
 584
 585           out_case = case_create (casewriter_get_proto (output));
 586           *case_num_rw_idx (out_case, 0) = case_num_idx (c, 1);
 587           for (i = 0; i < cmd->n_rs; ++i)
 588             {
 589               rank_function_t func = rank_func[cmd->rs[i].rfunc];
 590               double rank = func (cmd, tw, cc, cc_1, tie_group, w);
 591               *case_num_rw_idx (out_case, i + 1) = rank;
 592             }
 593
 594           casewriter_write (output, out_case);
 595         }
 596       tie_group++;
 597     }
 598   casegrouper_destroy (tie_grouper);
 599 }
 600
 601
 602 static bool
 603 rank_cmd (struct dataset *ds,  const struct rank *cmd);
 604
 605 static const char *
 606 fraction_name (const struct rank *cmd)
 607 {
 608   switch (cmd->fraction)
 609     {
 610     case FRAC_BLOM:   return "BLOM";
 611     case FRAC_RANKIT: return "RANKIT";
 612     case FRAC_TUKEY:  return "TUKEY";
 613     case FRAC_VW:     return "VW";
 614     default:          NOT_REACHED ();
 615     }
 616 }
 617
 618 /* Returns a label for a variable derived from SRC_VAR with function F. */
 619 static const char *
 620 create_var_label (struct rank *cmd, const struct variable *src_var,
 621                   enum rank_func f)
 622 {
 623   struct string label;
 624   const char *pool_label;
 625
 626   ds_init_empty (&label);
 627
 628   if (cmd->n_group_vars > 0)
 629     {
 630       struct string group_var_str;
 631       int g;
 632
 633       ds_init_empty (&group_var_str);
 634
 635       for (g = 0 ; g < cmd->n_group_vars ; ++g)
 636         {
 637           if (g > 0) ds_put_cstr (&group_var_str, " ");
 638           ds_put_cstr (&group_var_str, var_get_name (cmd->group_vars[g]));
 639         }
 640
 641       ds_put_format (&label, _("%s of %s by %s"), function_name[f],
 642                      var_get_name (src_var), ds_cstr (&group_var_str));
 643       ds_destroy (&group_var_str);
 644     }
 645   else
 646     ds_put_format (&label, _("%s of %s"),
 647                    function_name[f], var_get_name (src_var));
 648
 649   pool_label = pool_strdup (cmd->pool, ds_cstr (&label));
 650
 651   ds_destroy (&label);
 652
 653   return pool_label;
 654 }
 655
 656 int
 657 cmd_rank (struct lexer *lexer, struct dataset *ds)
 658 {
 659   struct string_set new_names;
 660   struct rank rank;
 661   struct rank_spec *rs;
 662
 663   subcase_init_empty (&rank.sc);
 664
 665   rank.rs = NULL;
 666   rank.n_rs = 0;
 667   rank.exclude = MV_ANY;
 668   rank.n_group_vars = 0;
 669   rank.group_vars = NULL;
 670   rank.dict = dataset_dict (ds);
 671   rank.ties = TIES_MEAN;
 672   rank.fraction = FRAC_BLOM;
 673   rank.print = true;
 674   rank.vars = NULL;
 675   rank.pool = pool_create ();
 676
 677   string_set_init (&new_names);
 678
 679   if (lex_match_id (lexer, "VARIABLES"))
 680     if (! lex_force_match (lexer, T_EQUALS))
 681       goto error;
 682
 683   if (!parse_sort_criteria (lexer, rank.dict,
 684                             &rank.sc,
 685                             &rank.vars, NULL))
 686     goto error;
 687
 688   rank.n_vars = rank.sc.n_fields;
 689
 690   if (lex_match (lexer, T_BY))
 691     {
 692       if (! parse_variables_const (lexer, rank.dict,
 693                                     &rank.group_vars, &rank.n_group_vars,
 694                                     PV_NO_DUPLICATE | PV_NO_SCRATCH))
 695         goto error;
 696     }
 697
 698
 699   while (lex_token (lexer) != T_ENDCMD)
 700     {
 701       if (! lex_force_match (lexer, T_SLASH))
 702         goto error;
 703       if (lex_match_id (lexer, "TIES"))
 704         {
 705           if (! lex_force_match (lexer, T_EQUALS))
 706             goto error;
 707           if (lex_match_id (lexer, "MEAN"))
 708             {
 709               rank.ties = TIES_MEAN;
 710             }
 711           else if (lex_match_id (lexer, "LOW"))
 712             {
 713               rank.ties = TIES_LOW;
 714             }
 715           else if (lex_match_id (lexer, "HIGH"))
 716             {
 717               rank.ties = TIES_HIGH;
 718             }
 719           else if (lex_match_id (lexer, "CONDENSE"))
 720             {
 721               rank.ties = TIES_CONDENSE;
 722             }
 723           else
 724             {
 725               lex_error (lexer, NULL);
 726               goto error;
 727             }
 728         }
 729       else if (lex_match_id (lexer, "FRACTION"))
 730         {
 731           if (! lex_force_match (lexer, T_EQUALS))
 732             goto error;
 733           if (lex_match_id (lexer, "BLOM"))
 734             {
 735               rank.fraction = FRAC_BLOM;
 736             }
 737           else if (lex_match_id (lexer, "TUKEY"))
 738             {
 739               rank.fraction = FRAC_TUKEY;
 740             }
 741           else if (lex_match_id (lexer, "VW"))
 742             {
 743               rank.fraction = FRAC_VW;
 744             }
 745           else if (lex_match_id (lexer, "RANKIT"))
 746             {
 747               rank.fraction = FRAC_RANKIT;
 748             }
 749           else
 750             {
 751               lex_error (lexer, NULL);
 752               goto error;
 753             }
 754         }
 755       else if (lex_match_id (lexer, "PRINT"))
 756         {
 757           if (! lex_force_match (lexer, T_EQUALS))
 758             goto error;
 759           if (lex_match_id (lexer, "YES"))
 760             {
 761               rank.print = true;
 762             }
 763           else if (lex_match_id (lexer, "NO"))
 764             {
 765               rank.print = false;
 766             }
 767           else
 768             {
 769               lex_error (lexer, NULL);
 770               goto error;
 771             }
 772         }
 773       else if (lex_match_id (lexer, "MISSING"))
 774         {
 775           if (! lex_force_match (lexer, T_EQUALS))
 776             goto error;
 777           if (lex_match_id (lexer, "INCLUDE"))
 778             {
 779               rank.exclude = MV_SYSTEM;
 780             }
 781           else if (lex_match_id (lexer, "EXCLUDE"))
 782             {
 783               rank.exclude = MV_ANY;
 784             }
 785           else
 786             {
 787               lex_error (lexer, NULL);
 788               goto error;
 789             }
 790         }
 791       else if (! parse_into (lexer, &rank, &new_names))
 792         goto error;
 793     }
 794
 795
 796   /* If no rank specs are given, then apply a default */
 797   if (rank.n_rs == 0)
 798     {
 799       struct rank_spec *rs;
 800
 801       rs = pool_calloc (rank.pool, 1, sizeof *rs);
 802       rs->rfunc = RANK;
 803       rs->dest_names = pool_calloc (rank.pool, rank.n_vars,
 804                                     sizeof *rs->dest_names);
 805
 806       rank.rs = rs;
 807       rank.n_rs = 1;
 808     }
 809
 810   /* Choose variable names for all rank destinations which haven't already been
 811      created with INTO. */
 812   for (rs = rank.rs; rs < &rank.rs[rank.n_rs]; rs++)
 813     {
 814       rs->dest_labels = pool_calloc (rank.pool, rank.n_vars,
 815                                      sizeof *rs->dest_labels);
 816       for (int v = 0 ; v < rank.n_vars ;  v ++)
 817         {
 818           const char **dst_name = &rs->dest_names[v];
 819           if (*dst_name == NULL)
 820             {
 821               *dst_name = rank_choose_dest_name (rank.dict, &new_names,
 822                                                  rs->rfunc,
 823                                                  var_get_name (rank.vars[v]));
 824               if (*dst_name == NULL)
 825                 goto error;
 826             }
 827
 828           rs->dest_labels[v] = create_var_label (&rank, rank.vars[v],
 829                                                  rs->rfunc);
 830         }
 831     }
 832
 833   if (rank.print)
 834     {
 835       struct pivot_table *table = pivot_table_create (
 836         N_("Variables Created by RANK"));
 837
 838       pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("New Variable"),
 839                               N_("New Variable"), N_("Function"),
 840                               N_("Fraction"), N_("Grouping Variables"));
 841
 842       struct pivot_dimension *variables = pivot_dimension_create (
 843         table, PIVOT_AXIS_ROW, N_("Existing Variable"),
 844         N_("Existing Variable"));
 845       variables->root->show_label = true;
 846
 847       for (size_t i = 0 ; i <  rank.n_rs ; ++i)
 848         {
 849           for (size_t v = 0 ; v < rank.n_vars ;  v ++)
 850             {
 851               int row_idx = pivot_category_create_leaf (
 852                 variables->root, pivot_value_new_variable (rank.vars[v]));
 853
 854               struct string group_vars = DS_EMPTY_INITIALIZER;
 855               for (int g = 0 ; g < rank.n_group_vars ; ++g)
 856                 {
 857                   if (g)
 858                     ds_put_byte (&group_vars, ' ');
 859                   ds_put_cstr (&group_vars, var_get_name (rank.group_vars[g]));
 860                 }
 861
 862               enum rank_func rfunc = rank.rs[i].rfunc;
 863               bool has_fraction = rfunc == NORMAL || rfunc == PROPORTION;
 864               const char *entries[] =
 865                 {
 866                   rank.rs[i].dest_names[v],
 867                   function_name[rank.rs[i].rfunc],
 868                   has_fraction ? fraction_name (&rank) : NULL,
 869                   rank.n_group_vars ? ds_cstr (&group_vars) : NULL,
 870                 };
 871               for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
 872                 {
 873                   const char *entry = entries[j];
 874                   if (entry)
 875                     pivot_table_put2 (table, j, row_idx,
 876                                       pivot_value_new_user_text (entry, -1));
 877                 }
 878               ds_destroy (&group_vars);
 879             }
 880         }
 881
 882       pivot_table_submit (table);
 883     }
 884
 885   /* Do the ranking */
 886   rank_cmd (ds, &rank);
 887
 888   destroy_rank (&rank);
 889   string_set_destroy (&new_names);
 890   return CMD_SUCCESS;
 891
 892  error:
 893
 894   destroy_rank (&rank);
 895   string_set_destroy (&new_names);
 896   return CMD_FAILURE;
 897 }
 898
 899 /* RANK transformation. */
 900 struct rank_trns
 901   {
 902     int order_case_idx;
 903
 904     struct rank_trns_input_var *input_vars;
 905     size_t n_input_vars;
 906
 907     size_t n_funcs;
 908   };
 909
 910 struct rank_trns_input_var
 911   {
 912     struct casereader *input;
 913     struct ccase *current;
 914
 915     struct variable **output_vars;
 916   };
 917
 918 static void
 919 advance_ranking (struct rank_trns_input_var *iv)
 920 {
 921   case_unref (iv->current);
 922   iv->current = casereader_read (iv->input);
 923 }
 924
 925 static enum trns_result
 926 rank_trns_proc (void *trns_, struct ccase **c, casenumber case_idx UNUSED)
 927 {
 928   struct rank_trns *trns = trns_;
 929   double order = case_num_idx (*c, trns->order_case_idx);
 930   struct rank_trns_input_var *iv;
 931
 932   *c = case_unshare (*c);
 933   for (iv = trns->input_vars; iv < &trns->input_vars[trns->n_input_vars]; iv++)
 934     while (iv->current != NULL)
 935       {
 936         double iv_order = case_num_idx (iv->current, 0);
 937         if (iv_order == order)
 938           {
 939             size_t i;
 940
 941             for (i = 0; i < trns->n_funcs; i++)
 942               *case_num_rw (*c, iv->output_vars[i])
 943                 = case_num_idx (iv->current, i + 1);
 944             advance_ranking (iv);
 945             break;
 946           }
 947         else if (iv_order > order)
 948           break;
 949         else
 950           advance_ranking (iv);
 951       }
 952   return TRNS_CONTINUE;
 953 }
 954
 955 static bool
 956 rank_trns_free (void *trns_)
 957 {
 958   struct rank_trns *trns = trns_;
 959   struct rank_trns_input_var *iv;
 960
 961   for (iv = trns->input_vars; iv < &trns->input_vars[trns->n_input_vars]; iv++)
 962     {
 963       casereader_destroy (iv->input);
 964       case_unref (iv->current);
 965
 966       free (iv->output_vars);
 967     }
 968   free (trns->input_vars);
 969   free (trns);
 970
 971   return true;
 972 }
 973
 974 static const struct trns_class rank_trns_class = {
 975   .name = "RANK",
 976   .execute = rank_trns_proc,
 977   .destroy = rank_trns_free,
 978 };
 979
 980 static bool
 981 rank_cmd (struct dataset *ds, const struct rank *cmd)
 982 {
 983   struct dictionary *d = dataset_dict (ds);
 984   struct variable *weight_var = dict_get_weight (d);
 985   struct casewriter **outputs;
 986   struct variable *order_var;
 987   struct casereader *input;
 988   struct rank_trns *trns;
 989   bool ok = true;
 990   int i;
 991
 992   order_var = add_permanent_ordering_transformation (ds);
 993
 994   /* Create output files. */
 995   {
 996     struct caseproto *output_proto;
 997     struct subcase by_order;
 998
 999     output_proto = caseproto_create ();
1000     for (i = 0; i < cmd->n_rs + 1; i++)
1001       output_proto = caseproto_add_width (output_proto, 0);
1002
1003     subcase_init (&by_order, 0, 0, SC_ASCEND);
1004
1005     outputs = xnmalloc (cmd->n_vars, sizeof *outputs);
1006     for (i = 0; i < cmd->n_vars; i++)
1007       outputs[i] = sort_create_writer (&by_order, output_proto);
1008
1009     subcase_uninit (&by_order);
1010     caseproto_unref (output_proto);
1011   }
1012
1013   /* Open the active file and make one pass per input variable. */
1014   input = proc_open (ds);
1015   input = casereader_create_filter_weight (input, d, NULL, NULL);
1016   for (i = 0 ; i < cmd->n_vars ; ++i)
1017     {
1018       const struct variable *input_var = cmd->vars[i];
1019       struct casereader *input_pass;
1020       struct casegrouper *split_grouper;
1021       struct casereader *split_group;
1022       struct subcase rank_ordering;
1023       struct subcase projection;
1024       struct subcase split_vars;
1025       struct subcase group_vars;
1026       int weight_idx;
1027       int j;
1028
1029       /* Discard cases that have missing values of input variable. */
1030       input_pass = i == cmd->n_vars - 1 ? input : casereader_clone (input);
1031       input_pass = casereader_create_filter_missing (input_pass, &input_var, 1,
1032                                                      cmd->exclude, NULL, NULL);
1033
1034       /* Keep only the columns we really need, to save time and space when we
1035          sort them just below.
1036
1037          After this projection, the input_pass case indexes look like:
1038
1039            - 0: input_var.
1040            - 1: order_var.
1041            - 2 and up: cmd->n_group_vars group variables
1042            - 2 + cmd->n_group_vars and up: split variables
1043            - 2 + cmd->n_group_vars + n_split_vars: weight var
1044       */
1045       subcase_init_empty (&projection);
1046       subcase_add_var_always (&projection, input_var, SC_ASCEND);
1047       subcase_add_var_always (&projection, order_var, SC_ASCEND);
1048       subcase_add_vars_always (&projection,
1049                                cmd->group_vars, cmd->n_group_vars);
1050       subcase_add_vars_always (&projection, dict_get_split_vars (d),
1051                                dict_get_n_splits (d));
1052       if (weight_var != NULL)
1053         {
1054           subcase_add_var_always (&projection, weight_var, SC_ASCEND);
1055           weight_idx = 2 + cmd->n_group_vars + dict_get_n_splits (d);
1056         }
1057       else
1058         weight_idx = -1;
1059       input_pass = casereader_project (input_pass, &projection);
1060       subcase_uninit (&projection);
1061
1062       /* Prepare 'group_vars' as the set of grouping variables. */
1063       subcase_init_empty (&group_vars);
1064       for (j = 0; j < cmd->n_group_vars; j++)
1065         subcase_add_always (&group_vars,
1066                             j + 2, var_get_width (cmd->group_vars[j]),
1067                             SC_ASCEND);
1068
1069       /* Prepare 'rank_ordering' for sorting with the group variables as
1070          primary key and the input variable as secondary key. */
1071       subcase_clone (&rank_ordering, &group_vars);
1072       subcase_add (&rank_ordering, 0, 0, subcase_get_direction (&cmd->sc, i));
1073
1074       /* Group by split variables */
1075       subcase_init_empty (&split_vars);
1076       for (j = 0; j < dict_get_n_splits (d); j++)
1077         subcase_add_always (&split_vars, 2 + j + cmd->n_group_vars,
1078                             var_get_width (dict_get_split_vars (d)[j]),
1079                             SC_ASCEND);
1080       split_grouper = casegrouper_create_subcase (input_pass, &split_vars);
1081       subcase_uninit (&split_vars);
1082       while (casegrouper_get_next_group (split_grouper, &split_group))
1083         {
1084           struct casereader *ordered;
1085           struct casegrouper *by_grouper;
1086           struct casereader *by_group;
1087
1088           ordered = sort_execute (split_group, &rank_ordering);
1089           by_grouper = casegrouper_create_subcase (ordered, &group_vars);
1090           while (casegrouper_get_next_group (by_grouper, &by_group))
1091             rank_sorted_file (by_group, outputs[i], weight_idx, cmd);
1092           ok = casegrouper_destroy (by_grouper) && ok;
1093         }
1094       subcase_uninit (&group_vars);
1095       subcase_uninit (&rank_ordering);
1096
1097       ok = casegrouper_destroy (split_grouper) && ok;
1098     }
1099   ok = proc_commit (ds) && ok;
1100
1101   /* Re-fetch the dictionary and order variable, because if TEMPORARY was in
1102      effect then there's a new dictionary. */
1103   d = dataset_dict (ds);
1104   order_var = dict_lookup_var_assert (d, "$ORDER");
1105
1106   /* Merge the original data set with the ranks (which we already sorted on
1107      $ORDER). */
1108   trns = xmalloc (sizeof *trns);
1109   trns->order_case_idx = var_get_case_index (order_var);
1110   trns->input_vars = xnmalloc (cmd->n_vars, sizeof *trns->input_vars);
1111   trns->n_input_vars = cmd->n_vars;
1112   trns->n_funcs = cmd->n_rs;
1113   for (i = 0; i < trns->n_input_vars; i++)
1114     {
1115       struct rank_trns_input_var *iv = &trns->input_vars[i];
1116       int j;
1117
1118       iv->input = casewriter_make_reader (outputs[i]);
1119       iv->current = casereader_read (iv->input);
1120       iv->output_vars = xnmalloc (trns->n_funcs, sizeof *iv->output_vars);
1121       for (j = 0; j < trns->n_funcs; j++)
1122         {
1123           struct rank_spec *rs = &cmd->rs[j];
1124           struct variable *var;
1125
1126           var = dict_create_var_assert (d, rs->dest_names[i], 0);
1127           var_set_both_formats (var, &dest_format[rs->rfunc]);
1128           var_set_label (var, rs->dest_labels[i]);
1129           var_set_measure (var, rank_measures[rs->rfunc]);
1130
1131           iv->output_vars[j] = var;
1132         }
1133     }
1134   free (outputs);
1135
1136   add_transformation (ds, &rank_trns_class, trns);
1137
1138   /* Delete our sort key, which we don't need anymore. */
1139   dict_delete_var (d, order_var);
1140
1141   return ok;
1142 }