pintos-os.org Git - pspp/blob - src/language/stats/rank.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <math.h>
  20 #include <gsl/gsl_cdf.h>
  21
  22 #include "data/case.h"
  23 #include "data/casegrouper.h"
  24 #include "data/casereader.h"
  25 #include "data/dataset.h"
  26 #include "data/dictionary.h"
  27 #include "data/format.h"
  28 #include "data/variable.h"
  29 #include "data/subcase.h"
  30 #include "data/casewriter.h"
  31 #include "data/short-names.h"
  32 #include "language/command.h"
  33 #include "language/lexer/lexer.h"
  34 #include "language/lexer/variable-parser.h"
  35 #include "language/stats/sort-criteria.h"
  36 #include "math/sort.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/i18n.h"
  39 #include "libpspp/message.h"
  40 #include "libpspp/misc.h"
  41 #include "libpspp/pool.h"
  42 #include "libpspp/stringi-set.h"
  43 #include "libpspp/taint.h"
  44 #include "output/pivot-table.h"
  45
  46 #include "gettext.h"
  47 #define _(msgid) gettext (msgid)
  48 #define N_(msgid) (msgid)
  49
  50 struct rank;
  51
  52 typedef double (*rank_function_t) (const struct rank*, double c, double cc, double cc_1,
  53                                    int i, double w);
  54
  55 static double rank_proportion (const struct rank *, double c, double cc, double cc_1,
  56                                int i, double w);
  57
  58 static double rank_normal (const struct rank *, double c, double cc, double cc_1,
  59                            int i, double w);
  60
  61 static double rank_percent (const struct rank *, double c, double cc, double cc_1,
  62                             int i, double w);
  63
  64 static double rank_rfraction (const struct rank *, double c, double cc, double cc_1,
  65                               int i, double w);
  66
  67 static double rank_rank (const struct rank *, double c, double cc, double cc_1,
  68                          int i, double w);
  69
  70 static double rank_n (const struct rank *, double c, double cc, double cc_1,
  71                       int i, double w);
  72
  73 static double rank_savage (const struct rank *, double c, double cc, double cc_1,
  74                            int i, double w);
  75
  76 static double rank_ntiles (const struct rank *, double c, double cc, double cc_1,
  77                            int i, double w);
  78
  79
  80 enum rank_func
  81   {
  82     RANK,
  83     NORMAL,
  84     PERCENT,
  85     RFRACTION,
  86     PROPORTION,
  87     N,
  88     NTILES,
  89     SAVAGE,
  90     n_RANK_FUNCS
  91   };
  92
  93 static const struct fmt_spec dest_format[n_RANK_FUNCS] = {
  94   [RANK]       = { .type = FMT_F, .w = 9, .d = 3 },
  95   [NORMAL]     = { .type = FMT_F, .w = 6, .d = 4 },
  96   [PERCENT]    = { .type = FMT_F, .w = 6, .d = 2 },
  97   [RFRACTION]  = { .type = FMT_F, .w = 6, .d = 4 },
  98   [PROPORTION] = { .type = FMT_F, .w = 6, .d = 4 },
  99   [N]          = { .type = FMT_F, .w = 6, .d = 0 },
 100   [NTILES]     = { .type = FMT_F, .w = 3, .d = 0 },
 101   [SAVAGE]     = { .type = FMT_F, .w = 8, .d = 4 }
 102 };
 103
 104 static const char * const function_name[n_RANK_FUNCS] = {
 105   "RANK",
 106   "NORMAL",
 107   "PERCENT",
 108   "RFRACTION",
 109   "PROPORTION",
 110   "N",
 111   "NTILES",
 112   "SAVAGE"
 113 };
 114
 115 static const rank_function_t rank_func[n_RANK_FUNCS] = {
 116   rank_rank,
 117   rank_normal,
 118   rank_percent,
 119   rank_rfraction,
 120   rank_proportion,
 121   rank_n,
 122   rank_ntiles,
 123   rank_savage
 124 };
 125
 126 static enum measure rank_measures[n_RANK_FUNCS] = {
 127   [RANK] = MEASURE_ORDINAL,
 128   [NORMAL] = MEASURE_ORDINAL,
 129   [PERCENT] = MEASURE_ORDINAL,
 130   [RFRACTION] = MEASURE_ORDINAL,
 131   [PROPORTION] = MEASURE_ORDINAL,
 132   [N] = MEASURE_SCALE,
 133   [NTILES] = MEASURE_ORDINAL,
 134   [SAVAGE] = MEASURE_ORDINAL,
 135 };
 136
 137 enum ties
 138   {
 139     TIES_LOW,
 140     TIES_HIGH,
 141     TIES_MEAN,
 142     TIES_CONDENSE
 143   };
 144
 145 enum fraction
 146   {
 147     FRAC_BLOM,
 148     FRAC_RANKIT,
 149     FRAC_TUKEY,
 150     FRAC_VW
 151   };
 152
 153 struct rank_spec
 154 {
 155   enum rank_func rfunc;
 156   const char **dest_names;
 157   const char **dest_labels;
 158 };
 159
 160 /* If NEW_NAME exists in DICT or NEW_NAMES, returns NULL without changing
 161    anything.  Otherwise, inserts NEW_NAME in NEW_NAMES and returns the copy of
 162    NEW_NAME now in NEW_NAMES.  In any case, frees NEW_NAME. */
 163 static const char *
 164 try_new_name (char *new_name,
 165               const struct dictionary *dict, struct stringi_set *new_names)
 166 {
 167   const char *retval = (!dict_lookup_var (dict, new_name)
 168                         && stringi_set_insert (new_names, new_name)
 169                         ? stringi_set_find_node (new_names, new_name)->string
 170                         : NULL);
 171   free (new_name);
 172   return retval;
 173 }
 174
 175 /* Returns a variable name for storing ranks of a variable named SRC_NAME
 176    according to the rank function F.  The name chosen will not be one already in
 177    DICT or NEW_NAMES.
 178
 179    If successful, adds the new name to NEW_NAMES and returns the name added.
 180    If no name can be generated, returns NULL. */
 181 static const char *
 182 rank_choose_dest_name (struct dictionary *dict, struct stringi_set *new_names,
 183                        enum rank_func f, const char *src_name)
 184 {
 185   /* Try the first character of the ranking function followed by the first 7
 186      bytes of the srcinal variable name. */
 187   char *src_name_7 = utf8_encoding_trunc (src_name, dict_get_encoding (dict),
 188                                           7);
 189   const char *s = try_new_name (
 190     xasprintf ("%c%s", function_name[f][0], src_name_7), dict, new_names);
 191   free (src_name_7);
 192   if (s)
 193     return s;
 194
 195   /* Try "fun###". */
 196   for (int i = 1; i <= 999; i++)
 197     {
 198       s = try_new_name (xasprintf ("%.3s%03d", function_name[f], i),
 199                         dict, new_names);
 200       if (s)
 201         return s;
 202     }
 203
 204   /* Try "RNKfn##". */
 205   for (int i = 1; i <= 99; i++)
 206     {
 207       s = try_new_name (xasprintf ("RNK%.2s%02d", function_name[f], i),
 208                         dict, new_names);
 209       if (s)
 210         return s;
 211     }
 212
 213   msg (ME, _("Cannot generate variable name for ranking %s with %s.  "
 214              "All candidates in use."),
 215        src_name, function_name[f]);
 216   return NULL;
 217 }
 218
 219 struct rank
 220 {
 221   struct dictionary *dict;
 222
 223   struct subcase sc;
 224
 225   const struct variable **vars;
 226   size_t n_vars;
 227
 228   const struct variable **group_vars;
 229   size_t n_group_vars;
 230
 231
 232   enum mv_class exclude;
 233
 234   struct rank_spec *rs;
 235   size_t n_rs;
 236
 237   enum ties ties;
 238
 239   enum fraction fraction;
 240   int k_ntiles;
 241
 242   bool print;
 243
 244   /* Pool on which cell functions may allocate data */
 245   struct pool *pool;
 246 };
 247
 248
 249 static void
 250 destroy_rank (struct rank *rank)
 251 {
 252   free (rank->vars);
 253   free (rank->group_vars);
 254   subcase_uninit (&rank->sc);
 255   pool_destroy (rank->pool);
 256 }
 257
 258 static bool
 259 parse_into (struct lexer *lexer, struct rank *cmd,
 260             struct stringi_set *new_names)
 261 {
 262   enum rank_func rfunc;
 263   if (lex_match_id (lexer, "RANK"))
 264     rfunc = RANK;
 265   else if (lex_match_id (lexer, "NORMAL"))
 266     rfunc = NORMAL;
 267   else if (lex_match_id (lexer, "RFRACTION"))
 268     rfunc = RFRACTION;
 269   else if (lex_match_id (lexer, "N"))
 270     rfunc = N;
 271   else if (lex_match_id (lexer, "SAVAGE"))
 272     rfunc = SAVAGE;
 273   else if (lex_match_id (lexer, "PERCENT"))
 274     rfunc = PERCENT;
 275   else if (lex_match_id (lexer, "PROPORTION"))
 276     rfunc = PROPORTION;
 277   else if (lex_match_id (lexer, "NTILES"))
 278     {
 279       if (!lex_force_match (lexer, T_LPAREN)
 280           || !lex_force_int_range (lexer, "NTILES", 1, INT_MAX))
 281         return false;
 282
 283       cmd->k_ntiles = lex_integer (lexer);
 284       lex_get (lexer);
 285
 286       if (!lex_force_match (lexer, T_RPAREN))
 287         return false;
 288
 289       rfunc = NTILES;
 290     }
 291   else
 292     {
 293       lex_error_expecting (lexer, "RANK", "NORMAL", "RFRACTION", "N",
 294                            "SAVAGE", "PERCENT", "PROPORTION", "NTILES");
 295       return false;
 296     }
 297
 298   cmd->rs = pool_realloc (cmd->pool, cmd->rs, sizeof (*cmd->rs) * (cmd->n_rs + 1));
 299   struct rank_spec *rs = &cmd->rs[cmd->n_rs++];
 300   *rs = (struct rank_spec) {
 301     .rfunc = rfunc,
 302     .dest_names = pool_calloc (cmd->pool, cmd->n_vars,
 303                                sizeof *rs->dest_names),
 304   };
 305
 306   if (lex_match_id (lexer, "INTO"))
 307     {
 308       int vars_start = lex_ofs (lexer);
 309       size_t var_count = 0;
 310       while (lex_token (lexer) == T_ID)
 311         {
 312           const char *name = lex_tokcstr (lexer);
 313
 314           if (var_count >= subcase_get_n_fields (&cmd->sc))
 315             lex_ofs_error (lexer, vars_start, lex_ofs (lexer),
 316                            _("Too many variables in %s clause."), "INTO");
 317           else if (dict_lookup_var (cmd->dict, name) != NULL)
 318             lex_error (lexer, _("Variable %s already exists."), name);
 319           else if (stringi_set_contains (new_names, name))
 320             lex_error (lexer, _("Duplicate variable name %s."), name);
 321           else
 322             {
 323               stringi_set_insert (new_names, name);
 324               rs->dest_names[var_count++] = pool_strdup (cmd->pool, name);
 325               lex_get (lexer);
 326               continue;
 327             }
 328
 329           /* Error path. */
 330           return false;
 331         }
 332     }
 333
 334   return true;
 335 }
 336
 337 /* Hardly a rank function. */
 338 static double
 339 rank_n (const struct rank *cmd UNUSED, double c UNUSED, double cc UNUSED, double cc_1 UNUSED,
 340         int i UNUSED, double w)
 341 {
 342   return w;
 343 }
 344
 345 static double
 346 rank_rank (const struct rank *cmd, double c, double cc, double cc_1,
 347            int i, double w UNUSED)
 348 {
 349   double rank;
 350
 351   if (c >= 1.0)
 352     {
 353       switch (cmd->ties)
 354         {
 355         case TIES_LOW:
 356           rank = cc_1 + 1;
 357           break;
 358         case TIES_HIGH:
 359           rank = cc;
 360           break;
 361         case TIES_MEAN:
 362           rank = cc_1 + (c + 1.0)/ 2.0;
 363           break;
 364         case TIES_CONDENSE:
 365           rank = i;
 366           break;
 367         default:
 368           NOT_REACHED ();
 369         }
 370     }
 371   else
 372     {
 373       switch (cmd->ties)
 374         {
 375         case TIES_LOW:
 376           rank = cc_1;
 377           break;
 378         case TIES_HIGH:
 379           rank = cc;
 380           break;
 381         case TIES_MEAN:
 382           rank = cc_1 + c / 2.0;
 383           break;
 384         case TIES_CONDENSE:
 385           rank = i;
 386           break;
 387         default:
 388           NOT_REACHED ();
 389         }
 390     }
 391
 392   return rank;
 393 }
 394
 395
 396 static double
 397 rank_rfraction (const struct rank *cmd, double c, double cc, double cc_1,
 398                 int i, double w)
 399 {
 400   return rank_rank (cmd, c, cc, cc_1, i, w) / w;
 401 }
 402
 403
 404 static double
 405 rank_percent (const struct rank *cmd, double c, double cc, double cc_1,
 406               int i, double w)
 407 {
 408   return rank_rank (cmd, c, cc, cc_1, i, w) * 100.0 / w;
 409 }
 410
 411
 412 static double
 413 rank_proportion (const struct rank *cmd, double c, double cc, double cc_1,
 414                  int i, double w)
 415 {
 416   const double r =  rank_rank (cmd, c, cc, cc_1, i, w);
 417
 418   double f;
 419
 420   switch (cmd->fraction)
 421     {
 422     case FRAC_BLOM:
 423       f =  (r - 3.0/8.0) / (w + 0.25);
 424       break;
 425     case FRAC_RANKIT:
 426       f = (r - 0.5) / w;
 427       break;
 428     case FRAC_TUKEY:
 429       f = (r - 1.0/3.0) / (w + 1.0/3.0);
 430       break;
 431     case FRAC_VW:
 432       f = r / (w + 1.0);
 433       break;
 434     default:
 435       NOT_REACHED ();
 436     }
 437
 438
 439   return (f > 0) ? f : SYSMIS;
 440 }
 441
 442 static double
 443 rank_normal (const struct rank *cmd, double c, double cc, double cc_1,
 444              int i, double w)
 445 {
 446   double f = rank_proportion (cmd, c, cc, cc_1, i, w);
 447
 448   return gsl_cdf_ugaussian_Pinv (f);
 449 }
 450
 451 static double
 452 rank_ntiles (const struct rank *cmd, double c, double cc, double cc_1,
 453              int i, double w)
 454 {
 455   double r = rank_rank (cmd, c, cc, cc_1, i, w);
 456
 457
 458   return (floor ((r * cmd->k_ntiles) / (w + 1)) + 1);
 459 }
 460
 461 /* Expected value of the order statistics from an exponential distribution */
 462 static double
 463 ee (int j, double w_star)
 464 {
 465   double sum = 0.0;
 466
 467   for (int k = 1; k <= j; k++)
 468     sum += 1.0 / (w_star + 1 - k);
 469
 470   return sum;
 471 }
 472
 473
 474 static double
 475 rank_savage (const struct rank *cmd UNUSED, double c, double cc, double cc_1,
 476              int i UNUSED, double w)
 477 {
 478   double int_part;
 479   const int i_1 = floor (cc_1);
 480   const int i_2 = floor (cc);
 481
 482   const double w_star = (modf (w, &int_part) == 0) ? w : floor (w) + 1;
 483
 484   const double g_1 = cc_1 - i_1;
 485   const double g_2 = cc - i_2;
 486
 487   /* The second factor is infinite, when the first is zero.
 488      Therefore, evaluate the second, only when the first is non-zero */
 489   const double expr1 =  (1 - g_1) ? (1 - g_1) * ee(i_1+1, w_star) : (1 - g_1);
 490   const double expr2 =  g_2 ? g_2 * ee (i_2+1, w_star) : g_2;
 491
 492   if (i_1 == i_2)
 493     return ee (i_1 + 1, w_star) - 1;
 494
 495   if (i_1 + 1 == i_2)
 496     return ((expr1 + expr2)/c) - 1;
 497
 498   if (i_1 + 2 <= i_2)
 499     {
 500       double sigma = 0.0;
 501       for (int j = i_1 + 2; j <= i_2; ++j)
 502         sigma += ee (j, w_star);
 503       return ((expr1 + expr2 + sigma) / c) -1;
 504     }
 505
 506   NOT_REACHED ();
 507 }
 508
 509 static double
 510 sum_weights (const struct casereader *input, int weight_idx)
 511 {
 512   if (weight_idx == -1)
 513     return casereader_count_cases (input);
 514
 515   double w = 0.0;
 516
 517   struct casereader *pass = casereader_clone (input);
 518   struct ccase *c;
 519   for (; (c = casereader_read (pass)) != NULL; case_unref (c))
 520     w += case_num_idx (c, weight_idx);
 521   casereader_destroy (pass);
 522
 523   return w;
 524 }
 525
 526 static void
 527 rank_sorted_file (struct casereader *input,
 528                   struct casewriter *output,
 529                   int weight_idx,
 530                   const struct rank *cmd)
 531 {
 532   int tie_group = 1;
 533   double cc = 0.0;
 534
 535   /* Get total group weight. */
 536   double w = sum_weights (input, weight_idx);
 537
 538   /* Do ranking. */
 539   struct subcase input_var = SUBCASE_EMPTY_INITIALIZER;
 540   subcase_add (&input_var, 0, 0, SC_ASCEND);
 541   struct casegrouper *tie_grouper = casegrouper_create_subcase (input, &input_var);
 542   subcase_uninit (&input_var);
 543
 544   struct casereader *tied_cases;
 545   for (; casegrouper_get_next_group (tie_grouper, &tied_cases);
 546        casereader_destroy (tied_cases))
 547     {
 548       double tw = sum_weights (tied_cases, weight_idx);
 549       double cc_1 = cc;
 550       cc += tw;
 551
 552       taint_propagate (casereader_get_taint (tied_cases),
 553                        casewriter_get_taint (output));
 554
 555       /* Rank tied cases. */
 556       struct ccase *c;
 557       for (; (c = casereader_read (tied_cases)) != NULL; case_unref (c))
 558         {
 559           struct ccase *out_case = case_create (casewriter_get_proto (output));
 560           *case_num_rw_idx (out_case, 0) = case_num_idx (c, 1);
 561           for (size_t i = 0; i < cmd->n_rs; ++i)
 562             {
 563               rank_function_t func = rank_func[cmd->rs[i].rfunc];
 564               double rank = func (cmd, tw, cc, cc_1, tie_group, w);
 565               *case_num_rw_idx (out_case, i + 1) = rank;
 566             }
 567
 568           casewriter_write (output, out_case);
 569         }
 570       tie_group++;
 571     }
 572   casegrouper_destroy (tie_grouper);
 573 }
 574
 575
 576 static bool
 577 rank_cmd (struct dataset *ds,  const struct rank *cmd);
 578
 579 static const char *
 580 fraction_name (const struct rank *cmd)
 581 {
 582   switch (cmd->fraction)
 583     {
 584     case FRAC_BLOM:   return "BLOM";
 585     case FRAC_RANKIT: return "RANKIT";
 586     case FRAC_TUKEY:  return "TUKEY";
 587     case FRAC_VW:     return "VW";
 588     default:          NOT_REACHED ();
 589     }
 590 }
 591
 592 /* Returns a label for a variable derived from SRC_VAR with function F. */
 593 static const char *
 594 create_var_label (struct rank *cmd, const struct variable *src_var,
 595                   enum rank_func f)
 596 {
 597   if (cmd->n_group_vars > 0)
 598     {
 599       struct string group_var_str = DS_EMPTY_INITIALIZER;
 600       for (size_t g = 0; g < cmd->n_group_vars; ++g)
 601         {
 602           if (g > 0)
 603             ds_put_cstr (&group_var_str, " ");
 604           ds_put_cstr (&group_var_str, var_get_name (cmd->group_vars[g]));
 605         }
 606
 607       const char *label = pool_asprintf (
 608         cmd->pool, _("%s of %s by %s"), function_name[f],
 609         var_get_name (src_var), ds_cstr (&group_var_str));
 610       ds_destroy (&group_var_str);
 611       return label;
 612     }
 613   else
 614     return pool_asprintf (cmd->pool, _("%s of %s"),
 615                           function_name[f], var_get_name (src_var));
 616 }
 617
 618 int
 619 cmd_rank (struct lexer *lexer, struct dataset *ds)
 620 {
 621   struct stringi_set new_names = STRINGI_SET_INITIALIZER (new_names);
 622   struct rank rank = {
 623     .sc = SUBCASE_EMPTY_INITIALIZER,
 624     .exclude = MV_ANY,
 625     .dict = dataset_dict (ds),
 626     .ties = TIES_MEAN,
 627     .fraction = FRAC_BLOM,
 628     .print = true,
 629     .pool = pool_create (),
 630   };
 631
 632   if (lex_match_id (lexer, "VARIABLES") && !lex_force_match (lexer, T_EQUALS))
 633     goto error;
 634
 635   if (!parse_sort_criteria (lexer, rank.dict, &rank.sc, &rank.vars, NULL))
 636     goto error;
 637   rank.n_vars = rank.sc.n_fields;
 638
 639   if (lex_match (lexer, T_BY)
 640       && !parse_variables_const (lexer, rank.dict,
 641                                  &rank.group_vars, &rank.n_group_vars,
 642                                  PV_NO_DUPLICATE | PV_NO_SCRATCH))
 643     goto error;
 644
 645   while (lex_token (lexer) != T_ENDCMD)
 646     {
 647       if (!lex_force_match (lexer, T_SLASH))
 648         goto error;
 649       if (lex_match_id (lexer, "TIES"))
 650         {
 651           if (!lex_force_match (lexer, T_EQUALS))
 652             goto error;
 653           if (lex_match_id (lexer, "MEAN"))
 654             rank.ties = TIES_MEAN;
 655           else if (lex_match_id (lexer, "LOW"))
 656             rank.ties = TIES_LOW;
 657           else if (lex_match_id (lexer, "HIGH"))
 658             rank.ties = TIES_HIGH;
 659           else if (lex_match_id (lexer, "CONDENSE"))
 660             rank.ties = TIES_CONDENSE;
 661           else
 662             {
 663               lex_error_expecting (lexer, "MEAN", "LOW", "HIGH", "CONDENSE");
 664               goto error;
 665             }
 666         }
 667       else if (lex_match_id (lexer, "FRACTION"))
 668         {
 669           if (!lex_force_match (lexer, T_EQUALS))
 670             goto error;
 671           if (lex_match_id (lexer, "BLOM"))
 672             rank.fraction = FRAC_BLOM;
 673           else if (lex_match_id (lexer, "TUKEY"))
 674             rank.fraction = FRAC_TUKEY;
 675           else if (lex_match_id (lexer, "VW"))
 676             rank.fraction = FRAC_VW;
 677           else if (lex_match_id (lexer, "RANKIT"))
 678             rank.fraction = FRAC_RANKIT;
 679           else
 680             {
 681               lex_error_expecting (lexer, "BLOM", "TUKEY", "VW", "RANKIT");
 682               goto error;
 683             }
 684         }
 685       else if (lex_match_id (lexer, "PRINT"))
 686         {
 687           if (!lex_force_match (lexer, T_EQUALS))
 688             goto error;
 689           if (lex_match_id (lexer, "YES"))
 690             rank.print = true;
 691           else if (lex_match_id (lexer, "NO"))
 692             rank.print = false;
 693           else
 694             {
 695               lex_error_expecting (lexer, "YES", "NO");
 696               goto error;
 697             }
 698         }
 699       else if (lex_match_id (lexer, "MISSING"))
 700         {
 701           if (!lex_force_match (lexer, T_EQUALS))
 702             goto error;
 703           if (lex_match_id (lexer, "INCLUDE"))
 704             rank.exclude = MV_SYSTEM;
 705           else if (lex_match_id (lexer, "EXCLUDE"))
 706             rank.exclude = MV_ANY;
 707           else
 708             {
 709               lex_error_expecting (lexer, "INCLUDE", "EXCLUDE");
 710               goto error;
 711             }
 712         }
 713       else if (!parse_into (lexer, &rank, &new_names))
 714         goto error;
 715     }
 716
 717
 718   /* If no rank specs are given, then apply a default */
 719   if (rank.n_rs == 0)
 720     {
 721       struct rank_spec *rs = pool_malloc (rank.pool, sizeof *rs);
 722       *rs = (struct rank_spec) {
 723         .rfunc = RANK,
 724         .dest_names = pool_calloc (rank.pool, rank.n_vars,
 725                                    sizeof *rs->dest_names),
 726       };
 727
 728       rank.rs = rs;
 729       rank.n_rs = 1;
 730     }
 731
 732   /* Choose variable names for all rank destinations which haven't already been
 733      created with INTO. */
 734   for (struct rank_spec *rs = rank.rs; rs < &rank.rs[rank.n_rs]; rs++)
 735     {
 736       rs->dest_labels = pool_calloc (rank.pool, rank.n_vars,
 737                                      sizeof *rs->dest_labels);
 738       for (int v = 0; v < rank.n_vars;  v ++)
 739         {
 740           const char **dst_name = &rs->dest_names[v];
 741           if (*dst_name == NULL)
 742             {
 743               *dst_name = rank_choose_dest_name (rank.dict, &new_names,
 744                                                  rs->rfunc,
 745                                                  var_get_name (rank.vars[v]));
 746               if (*dst_name == NULL)
 747                 goto error;
 748             }
 749
 750           rs->dest_labels[v] = create_var_label (&rank, rank.vars[v],
 751                                                  rs->rfunc);
 752         }
 753     }
 754
 755   if (rank.print)
 756     {
 757       struct pivot_table *table = pivot_table_create (
 758         N_("Variables Created by RANK"));
 759
 760       pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("New Variable"),
 761                               N_("New Variable"), N_("Function"),
 762                               N_("Fraction"), N_("Grouping Variables"));
 763
 764       struct pivot_dimension *variables = pivot_dimension_create (
 765         table, PIVOT_AXIS_ROW, N_("Existing Variable"),
 766         N_("Existing Variable"));
 767       variables->root->show_label = true;
 768
 769       for (size_t i = 0; i <  rank.n_rs; ++i)
 770         {
 771           for (size_t v = 0; v < rank.n_vars;  v ++)
 772             {
 773               int row_idx = pivot_category_create_leaf (
 774                 variables->root, pivot_value_new_variable (rank.vars[v]));
 775
 776               struct string group_vars = DS_EMPTY_INITIALIZER;
 777               for (int g = 0; g < rank.n_group_vars; ++g)
 778                 {
 779                   if (g)
 780                     ds_put_byte (&group_vars, ' ');
 781                   ds_put_cstr (&group_vars, var_get_name (rank.group_vars[g]));
 782                 }
 783
 784               enum rank_func rfunc = rank.rs[i].rfunc;
 785               bool has_fraction = rfunc == NORMAL || rfunc == PROPORTION;
 786               const char *entries[] =
 787                 {
 788                   rank.rs[i].dest_names[v],
 789                   function_name[rank.rs[i].rfunc],
 790                   has_fraction ? fraction_name (&rank) : NULL,
 791                   rank.n_group_vars ? ds_cstr (&group_vars) : NULL,
 792                 };
 793               for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
 794                 {
 795                   const char *entry = entries[j];
 796                   if (entry)
 797                     pivot_table_put2 (table, j, row_idx,
 798                                       pivot_value_new_user_text (entry, -1));
 799                 }
 800               ds_destroy (&group_vars);
 801             }
 802         }
 803
 804       pivot_table_submit (table);
 805     }
 806
 807   /* Do the ranking */
 808   rank_cmd (ds, &rank);
 809
 810   destroy_rank (&rank);
 811   stringi_set_destroy (&new_names);
 812   return CMD_SUCCESS;
 813
 814  error:
 815   destroy_rank (&rank);
 816   stringi_set_destroy (&new_names);
 817   return CMD_FAILURE;
 818 }
 819
 820 /* RANK transformation. */
 821 struct rank_trns
 822   {
 823     int order_case_idx;
 824
 825     struct rank_trns_input_var *input_vars;
 826     size_t n_input_vars;
 827
 828     size_t n_funcs;
 829   };
 830
 831 struct rank_trns_input_var
 832   {
 833     struct casereader *input;
 834     struct ccase *current;
 835
 836     struct variable **output_vars;
 837   };
 838
 839 static void
 840 advance_ranking (struct rank_trns_input_var *iv)
 841 {
 842   case_unref (iv->current);
 843   iv->current = casereader_read (iv->input);
 844 }
 845
 846 static enum trns_result
 847 rank_trns_proc (void *trns_, struct ccase **c, casenumber case_idx UNUSED)
 848 {
 849   struct rank_trns *trns = trns_;
 850   double order = case_num_idx (*c, trns->order_case_idx);
 851   struct rank_trns_input_var *iv;
 852
 853   *c = case_unshare (*c);
 854   for (iv = trns->input_vars; iv < &trns->input_vars[trns->n_input_vars]; iv++)
 855     while (iv->current != NULL)
 856       {
 857         double iv_order = case_num_idx (iv->current, 0);
 858         if (iv_order == order)
 859           {
 860             size_t i;
 861
 862             for (i = 0; i < trns->n_funcs; i++)
 863               *case_num_rw (*c, iv->output_vars[i])
 864                 = case_num_idx (iv->current, i + 1);
 865             advance_ranking (iv);
 866             break;
 867           }
 868         else if (iv_order > order)
 869           break;
 870         else
 871           advance_ranking (iv);
 872       }
 873   return TRNS_CONTINUE;
 874 }
 875
 876 static bool
 877 rank_trns_free (void *trns_)
 878 {
 879   struct rank_trns *trns = trns_;
 880   struct rank_trns_input_var *iv;
 881
 882   for (iv = trns->input_vars; iv < &trns->input_vars[trns->n_input_vars]; iv++)
 883     {
 884       casereader_destroy (iv->input);
 885       case_unref (iv->current);
 886
 887       free (iv->output_vars);
 888     }
 889   free (trns->input_vars);
 890   free (trns);
 891
 892   return true;
 893 }
 894
 895 static const struct trns_class rank_trns_class = {
 896   .name = "RANK",
 897   .execute = rank_trns_proc,
 898   .destroy = rank_trns_free,
 899 };
 900
 901 static bool
 902 rank_cmd (struct dataset *ds, const struct rank *cmd)
 903 {
 904   struct dictionary *d = dataset_dict (ds);
 905   struct variable *weight_var = dict_get_weight (d);
 906   bool ok = true;
 907
 908   struct variable *order_var = add_permanent_ordering_transformation (ds);
 909
 910   /* Create output files. */
 911   struct caseproto *output_proto = caseproto_create ();
 912   for (size_t i = 0; i < cmd->n_rs + 1; i++)
 913     output_proto = caseproto_add_width (output_proto, 0);
 914
 915   struct subcase by_order;
 916   subcase_init (&by_order, 0, 0, SC_ASCEND);
 917
 918   struct casewriter **outputs = xnmalloc (cmd->n_vars, sizeof *outputs);
 919   for (size_t i = 0; i < cmd->n_vars; i++)
 920     outputs[i] = sort_create_writer (&by_order, output_proto);
 921
 922   subcase_uninit (&by_order);
 923   caseproto_unref (output_proto);
 924
 925   /* Open the active file and make one pass per input variable. */
 926   struct casereader *input = proc_open (ds);
 927   input = casereader_create_filter_weight (input, d, NULL, NULL);
 928   for (size_t i = 0; i < cmd->n_vars; ++i)
 929     {
 930       const struct variable *input_var = cmd->vars[i];
 931
 932       /* Discard cases that have missing values of input variable. */
 933       struct casereader *input_pass
 934         = i == cmd->n_vars - 1 ? input : casereader_clone (input);
 935       input_pass = casereader_create_filter_missing (input_pass, &input_var, 1,
 936                                                      cmd->exclude, NULL, NULL);
 937
 938       /* Keep only the columns we really need, to save time and space when we
 939          sort them just below.
 940
 941          After this projection, the input_pass case indexes look like:
 942
 943            - 0: input_var.
 944            - 1: order_var.
 945            - 2 and up: cmd->n_group_vars group variables
 946            - 2 + cmd->n_group_vars and up: split variables
 947            - 2 + cmd->n_group_vars + n_split_vars: weight var
 948       */
 949       struct subcase projection = SUBCASE_EMPTY_INITIALIZER;
 950       subcase_add_var_always (&projection, input_var, SC_ASCEND);
 951       subcase_add_var_always (&projection, order_var, SC_ASCEND);
 952       subcase_add_vars_always (&projection,
 953                                cmd->group_vars, cmd->n_group_vars);
 954       subcase_add_vars_always (&projection, dict_get_split_vars (d),
 955                                dict_get_n_splits (d));
 956       int weight_idx;
 957       if (weight_var != NULL)
 958         {
 959           subcase_add_var_always (&projection, weight_var, SC_ASCEND);
 960           weight_idx = 2 + cmd->n_group_vars + dict_get_n_splits (d);
 961         }
 962       else
 963         weight_idx = -1;
 964       input_pass = casereader_project (input_pass, &projection);
 965       subcase_uninit (&projection);
 966
 967       /* Prepare 'group_vars' as the set of grouping variables. */
 968       struct subcase group_vars = SUBCASE_EMPTY_INITIALIZER;
 969       for (size_t j = 0; j < cmd->n_group_vars; j++)
 970         subcase_add_always (&group_vars,
 971                             j + 2, var_get_width (cmd->group_vars[j]),
 972                             SC_ASCEND);
 973
 974       /* Prepare 'rank_ordering' for sorting with the group variables as
 975          primary key and the input variable as secondary key. */
 976       struct subcase rank_ordering;
 977       subcase_clone (&rank_ordering, &group_vars);
 978       subcase_add (&rank_ordering, 0, 0, subcase_get_direction (&cmd->sc, i));
 979
 980       /* Group by split variables */
 981       struct subcase split_vars = SUBCASE_EMPTY_INITIALIZER;
 982       for (size_t j = 0; j < dict_get_n_splits (d); j++)
 983         subcase_add_always (&split_vars, 2 + j + cmd->n_group_vars,
 984                             var_get_width (dict_get_split_vars (d)[j]),
 985                             SC_ASCEND);
 986
 987       struct casegrouper *split_grouper
 988         = casegrouper_create_subcase (input_pass, &split_vars);
 989       subcase_uninit (&split_vars);
 990
 991       struct casereader *split_group;
 992       while (casegrouper_get_next_group (split_grouper, &split_group))
 993         {
 994           struct casereader *ordered;
 995           struct casegrouper *by_grouper;
 996           struct casereader *by_group;
 997
 998           ordered = sort_execute (split_group, &rank_ordering);
 999           by_grouper = casegrouper_create_subcase (ordered, &group_vars);
1000           while (casegrouper_get_next_group (by_grouper, &by_group))
1001             rank_sorted_file (by_group, outputs[i], weight_idx, cmd);
1002           ok = casegrouper_destroy (by_grouper) && ok;
1003         }
1004       subcase_uninit (&group_vars);
1005       subcase_uninit (&rank_ordering);
1006
1007       ok = casegrouper_destroy (split_grouper) && ok;
1008     }
1009   ok = proc_commit (ds) && ok;
1010
1011   /* Re-fetch the dictionary and order variable, because if TEMPORARY was in
1012      effect then there's a new dictionary. */
1013   d = dataset_dict (ds);
1014   order_var = dict_lookup_var_assert (d, "$ORDER");
1015
1016   /* Merge the original data set with the ranks (which we already sorted on
1017      $ORDER). */
1018   struct rank_trns *trns = xmalloc (sizeof *trns);
1019   trns->order_case_idx = var_get_case_index (order_var);
1020   trns->input_vars = xnmalloc (cmd->n_vars, sizeof *trns->input_vars);
1021   trns->n_input_vars = cmd->n_vars;
1022   trns->n_funcs = cmd->n_rs;
1023   for (size_t i = 0; i < trns->n_input_vars; i++)
1024     {
1025       struct rank_trns_input_var *iv = &trns->input_vars[i];
1026
1027       iv->input = casewriter_make_reader (outputs[i]);
1028       iv->current = casereader_read (iv->input);
1029       iv->output_vars = xnmalloc (trns->n_funcs, sizeof *iv->output_vars);
1030       for (size_t j = 0; j < trns->n_funcs; j++)
1031         {
1032           struct rank_spec *rs = &cmd->rs[j];
1033           struct variable *var;
1034
1035           var = dict_create_var_assert (d, rs->dest_names[i], 0);
1036           var_set_both_formats (var, &dest_format[rs->rfunc]);
1037           var_set_label (var, rs->dest_labels[i]);
1038           var_set_measure (var, rank_measures[rs->rfunc]);
1039
1040           iv->output_vars[j] = var;
1041         }
1042     }
1043   free (outputs);
1044
1045   add_transformation (ds, &rank_trns_class, trns);
1046
1047   /* Delete our sort key, which we don't need anymore. */
1048   dict_delete_var (d, order_var);
1049
1050   return ok;
1051 }