pintos-os.org Git - pspp/blob - src/language/stats/reliability.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2009, 2010, 2011, 2013, 2015, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include <math.h>
  20
  21 #include "data/casegrouper.h"
  22 #include "data/casereader.h"
  23 #include "data/dataset.h"
  24 #include "data/dictionary.h"
  25 #include "data/format.h"
  26 #include "data/missing-values.h"
  27 #include "language/command.h"
  28 #include "language/lexer/lexer.h"
  29 #include "language/lexer/variable-parser.h"
  30 #include "libpspp/message.h"
  31 #include "libpspp/misc.h"
  32 #include "libpspp/str.h"
  33 #include "math/moments.h"
  34 #include "output/pivot-table.h"
  35 #include "output/output-item.h"
  36
  37 #include "gettext.h"
  38 #define _(msgid) gettext (msgid)
  39 #define N_(msgid) msgid
  40
  41 struct cronbach
  42   {
  43     const struct variable **items;
  44     size_t n_items;
  45     double alpha;
  46     double sum_of_variances;
  47     double variance_of_sums;
  48     int totals_idx;          /* Casereader index into the totals */
  49
  50     struct moments1 **m;    /* Moments of the items */
  51     struct moments1 *total; /* Moments of the totals */
  52   };
  53
  54 #if 0
  55 static void
  56 dump_cronbach (const struct cronbach *s)
  57 {
  58   int i;
  59   printf ("N items %d\n", s->n_items);
  60   for (i = 0; i < s->n_items; ++i)
  61     {
  62       printf ("%s\n", var_get_name (s->items[i]));
  63     }
  64
  65   printf ("Totals idx %d\n", s->totals_idx);
  66
  67   printf ("scale variance %g\n", s->variance_of_sums);
  68   printf ("alpha %g\n", s->alpha);
  69   putchar ('\n');
  70 }
  71 #endif
  72
  73 enum model
  74   {
  75     MODEL_ALPHA,
  76     MODEL_SPLIT
  77   };
  78
  79
  80 struct reliability
  81 {
  82   const struct variable **vars;
  83   size_t n_vars;
  84   enum mv_class exclude;
  85
  86   struct cronbach *sc;
  87   int n_sc;
  88
  89   int total_start;
  90
  91   char *scale_name;
  92
  93   enum model model;
  94   int split_point;
  95
  96   bool summary_total;
  97
  98   const struct variable *wv;
  99 };
 100
 101
 102 static bool run_reliability (struct dataset *ds, const struct reliability *reliability);
 103
 104 static void
 105 reliability_destroy (struct reliability *rel)
 106 {
 107   int j;
 108   free (rel->scale_name);
 109   if (rel->sc)
 110     for (j = 0; j < rel->n_sc; ++j)
 111       {
 112         int x;
 113         free (rel->sc[j].items);
 114         moments1_destroy (rel->sc[j].total);
 115         if (rel->sc[j].m)
 116           for (x = 0; x < rel->sc[j].n_items; ++x)
 117             free (rel->sc[j].m[x]);
 118         free (rel->sc[j].m);
 119       }
 120
 121   free (rel->sc);
 122   free (rel->vars);
 123 }
 124
 125 int
 126 cmd_reliability (struct lexer *lexer, struct dataset *ds)
 127 {
 128   const struct dictionary *dict = dataset_dict (ds);
 129
 130   struct reliability r = {
 131     .model = MODEL_ALPHA,
 132     .exclude = MV_ANY,
 133     .wv = dict_get_weight (dict),
 134     .scale_name = xstrdup ("ANY"),
 135   };
 136
 137   lex_match (lexer, T_SLASH);
 138
 139   if (!lex_force_match_id (lexer, "VARIABLES"))
 140     goto error;
 141
 142   lex_match (lexer, T_EQUALS);
 143
 144   int vars_start = lex_ofs (lexer);
 145   if (!parse_variables_const (lexer, dict, &r.vars, &r.n_vars,
 146                               PV_NO_DUPLICATE | PV_NUMERIC))
 147     goto error;
 148   int vars_end = lex_ofs (lexer) - 1;
 149
 150   if (r.n_vars < 2)
 151     lex_ofs_msg (lexer, SW, vars_start, vars_end,
 152                  _("Reliability on a single variable is not useful."));
 153
 154   /* Create a default scale. */
 155   r.n_sc = 1;
 156   r.sc = xcalloc (r.n_sc, sizeof (struct cronbach));
 157
 158   struct cronbach *c = &r.sc[0];
 159   c->n_items = r.n_vars;
 160   c->items = xcalloc (c->n_items, sizeof (struct variable*));
 161   for (size_t i = 0; i < c->n_items; ++i)
 162     c->items[i] = r.vars[i];
 163
 164   int split_ofs = 0;
 165   while (lex_token (lexer) != T_ENDCMD)
 166     {
 167       lex_match (lexer, T_SLASH);
 168
 169       if (lex_match_id (lexer, "SCALE"))
 170         {
 171           struct const_var_set *vs;
 172           if (!lex_force_match (lexer, T_LPAREN))
 173             goto error;
 174
 175           if (!lex_force_string (lexer))
 176             goto error;
 177           free (r.scale_name);
 178           r.scale_name = xstrdup (lex_tokcstr (lexer));
 179           lex_get (lexer);
 180
 181           if (!lex_force_match (lexer, T_RPAREN))
 182             goto error;
 183
 184           lex_match (lexer, T_EQUALS);
 185
 186           vs = const_var_set_create_from_array (r.vars, r.n_vars);
 187
 188           free (r.sc->items);
 189           if (!parse_const_var_set_vars (lexer, vs, &r.sc->items, &r.sc->n_items, 0))
 190             {
 191               const_var_set_destroy (vs);
 192               goto error;
 193             }
 194
 195           const_var_set_destroy (vs);
 196         }
 197       else if (lex_match_id (lexer, "MODEL"))
 198         {
 199           lex_match (lexer, T_EQUALS);
 200           if (lex_match_id (lexer, "ALPHA"))
 201             r.model = MODEL_ALPHA;
 202           else if (lex_match_id (lexer, "SPLIT"))
 203             {
 204               r.model = MODEL_SPLIT;
 205               r.split_point = -1;
 206
 207               if (lex_match (lexer, T_LPAREN))
 208                 {
 209                   if (!lex_force_num (lexer))
 210                     goto error;
 211                   split_ofs = lex_ofs (lexer);
 212                   r.split_point = lex_number (lexer);
 213                   lex_get (lexer);
 214                   if (!lex_force_match (lexer, T_RPAREN))
 215                     goto error;
 216                 }
 217             }
 218           else
 219             {
 220               lex_error_expecting (lexer, "ALPHA", "SPLIT");
 221               goto error;
 222             }
 223         }
 224       else if (lex_match_id (lexer, "SUMMARY"))
 225         {
 226           lex_match (lexer, T_EQUALS);
 227           if (lex_match_id (lexer, "TOTAL") || lex_match (lexer, T_ALL))
 228             r.summary_total = true;
 229           else
 230             {
 231               lex_error_expecting (lexer, "TOTAL", "ALL");
 232               goto error;
 233             }
 234         }
 235       else if (lex_match_id (lexer, "MISSING"))
 236         {
 237           lex_match (lexer, T_EQUALS);
 238           while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
 239             {
 240               if (lex_match_id (lexer, "INCLUDE"))
 241                 r.exclude = MV_SYSTEM;
 242               else if (lex_match_id (lexer, "EXCLUDE"))
 243                 r.exclude = MV_ANY;
 244               else
 245                 {
 246                   lex_error_expecting (lexer, "INCLUDE", "EXCLUDE");
 247                   goto error;
 248                 }
 249             }
 250         }
 251       else if (lex_match_id (lexer, "STATISTICS"))
 252         {
 253           int statistics_start = lex_ofs (lexer) - 1;
 254           lex_match (lexer, T_EQUALS);
 255           while (lex_match (lexer, T_ID))
 256             continue;
 257           int statistics_end = lex_ofs (lexer) - 1;
 258
 259           lex_ofs_msg (lexer, SW, statistics_start, statistics_end,
 260                        _("The STATISTICS subcommand is not yet implemented.  "
 261                          "No statistics will be produced."));
 262         }
 263       else
 264         {
 265           lex_error_expecting (lexer, "SCALE", "MODEL", "SUMMARY", "MISSING",
 266                                "STATISTICS");
 267           goto error;
 268         }
 269     }
 270
 271   if (r.model == MODEL_SPLIT)
 272     {
 273       if (r.split_point >= r.n_vars)
 274         {
 275           lex_ofs_error (lexer, split_ofs, split_ofs,
 276                          _("The split point must be less than the "
 277                            "number of variables."));
 278           lex_ofs_msg (lexer, SN, vars_start, vars_end,
 279                        ngettext ("There is %zu variable.",
 280                                  "There are %zu variables.", r.n_vars),
 281                        r.n_vars);
 282           goto error;
 283         }
 284
 285       r.n_sc += 2;
 286       r.sc = xrealloc (r.sc, sizeof (struct cronbach) * r.n_sc);
 287
 288       const struct cronbach *s = &r.sc[0];
 289
 290       r.sc[1].n_items = r.split_point == -1 ? s->n_items / 2 : r.split_point;
 291
 292       r.sc[2].n_items = s->n_items - r.sc[1].n_items;
 293       r.sc[1].items = XCALLOC (r.sc[1].n_items, const struct variable *);
 294       r.sc[2].items = XCALLOC (r.sc[2].n_items, const struct variable *);
 295
 296       size_t i = 0;
 297       while (i < r.sc[1].n_items)
 298         {
 299           r.sc[1].items[i] = s->items[i];
 300           i++;
 301         }
 302       while (i < s->n_items)
 303         {
 304           r.sc[2].items[i - r.sc[1].n_items] = s->items[i];
 305           i++;
 306         }
 307     }
 308
 309   if (r.summary_total)
 310     {
 311       const int base_sc = r.n_sc;
 312
 313       r.total_start = base_sc;
 314
 315       r.n_sc +=  r.sc[0].n_items;
 316       r.sc = xrealloc (r.sc, sizeof (struct cronbach) * r.n_sc);
 317
 318       for (size_t i = 0; i < r.sc[0].n_items; ++i)
 319         {
 320           struct cronbach *s = &r.sc[i + base_sc];
 321
 322           s->n_items = r.sc[0].n_items - 1;
 323           s->items = xcalloc (s->n_items, sizeof (struct variable *));
 324
 325           size_t v_dest = 0;
 326           for (size_t v_src = 0; v_src < r.sc[0].n_items; ++v_src)
 327             if (v_src != i)
 328               s->items[v_dest++] = r.sc[0].items[v_src];
 329         }
 330     }
 331
 332   if (!run_reliability (ds, &r))
 333     goto error;
 334
 335   reliability_destroy (&r);
 336   return CMD_SUCCESS;
 337
 338  error:
 339   reliability_destroy (&r);
 340   return CMD_FAILURE;
 341 }
 342
 343
 344 static void
 345 do_reliability (struct casereader *group, struct dataset *ds,
 346                 const struct reliability *rel);
 347
 348
 349 static void reliability_summary_total (const struct reliability *rel);
 350
 351 static void reliability_statistics (const struct reliability *rel);
 352
 353
 354 static bool
 355 run_reliability (struct dataset *ds, const struct reliability *reliability)
 356 {
 357   for (size_t si = 0; si < reliability->n_sc; ++si)
 358     {
 359       struct cronbach *s = &reliability->sc[si];
 360
 361       s->m = xcalloc (s->n_items, sizeof *s->m);
 362       s->total = moments1_create (MOMENT_VARIANCE);
 363
 364       for (size_t i = 0; i < s->n_items; ++i)
 365         s->m[i] = moments1_create (MOMENT_VARIANCE);
 366     }
 367
 368   struct dictionary *dict = dataset_dict (ds);
 369   struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), dict);
 370   struct casereader *group;
 371   while (casegrouper_get_next_group (grouper, &group))
 372     {
 373       do_reliability (group, ds, reliability);
 374
 375       reliability_statistics (reliability);
 376
 377       if (reliability->summary_total)
 378         reliability_summary_total (reliability);
 379     }
 380
 381   bool ok = casegrouper_destroy (grouper);
 382   ok = proc_commit (ds) && ok;
 383   return ok;
 384 }
 385 \f
 386 /* Return the sum of all the item variables in S */
 387 static double
 388 append_sum (const struct ccase *c, casenumber n UNUSED, void *aux)
 389 {
 390   double sum = 0;
 391   const struct cronbach *s = aux;
 392
 393   for (int v = 0; v < s->n_items; ++v)
 394     sum += case_num (c, s->items[v]);
 395
 396   return sum;
 397 }
 398
 399 static void
 400 case_processing_summary (casenumber n_valid, casenumber n_missing,
 401                          const struct dictionary *);
 402
 403 static double
 404 alpha (int k, double sum_of_variances, double variance_of_sums)
 405 {
 406   return k / (k - 1.0) * (1 - sum_of_variances / variance_of_sums);
 407 }
 408
 409 static void
 410 do_reliability (struct casereader *input, struct dataset *ds,
 411                 const struct reliability *rel)
 412 {
 413   for (size_t si = 0; si < rel->n_sc; ++si)
 414     {
 415       struct cronbach *s = &rel->sc[si];
 416
 417       moments1_clear (s->total);
 418       for (size_t i = 0; i < s->n_items; ++i)
 419         moments1_clear (s->m[i]);
 420     }
 421
 422   casenumber n_missing;
 423   input = casereader_create_filter_missing (input,
 424                                             rel->vars,
 425                                             rel->n_vars,
 426                                             rel->exclude,
 427                                             &n_missing,
 428                                             NULL);
 429
 430   for (size_t si = 0; si < rel->n_sc; ++si)
 431     {
 432       struct cronbach *s = &rel->sc[si];
 433       s->totals_idx = caseproto_get_n_widths (casereader_get_proto (input));
 434       input = casereader_create_append_numeric (input, append_sum, s, NULL);
 435     }
 436
 437   struct ccase *c;
 438   casenumber n_valid = 0;
 439   for (; (c = casereader_read (input)) != NULL; case_unref (c))
 440     {
 441       double weight = 1.0;
 442       n_valid++;
 443
 444       for (size_t si = 0; si < rel->n_sc; ++si)
 445         {
 446           struct cronbach *s = &rel->sc[si];
 447
 448           for (size_t i = 0; i < s->n_items; ++i)
 449             moments1_add (s->m[i], case_num (c, s->items[i]), weight);
 450           moments1_add (s->total, case_num_idx (c, s->totals_idx), weight);
 451         }
 452     }
 453   casereader_destroy (input);
 454
 455   for (size_t si = 0; si < rel->n_sc; ++si)
 456     {
 457       struct cronbach *s = &rel->sc[si];
 458
 459       s->sum_of_variances = 0;
 460       for (size_t i = 0; i < s->n_items; ++i)
 461         {
 462           double weight, mean, variance;
 463           moments1_calculate (s->m[i], &weight, &mean, &variance, NULL, NULL);
 464
 465           s->sum_of_variances += variance;
 466         }
 467
 468       moments1_calculate (s->total, NULL, NULL, &s->variance_of_sums,
 469                           NULL, NULL);
 470
 471       s->alpha = alpha (s->n_items, s->sum_of_variances, s->variance_of_sums);
 472     }
 473
 474   output_item_submit (text_item_create_nocopy (
 475                         TEXT_ITEM_TITLE,
 476                         xasprintf (_("Scale: %s"), rel->scale_name),
 477                         NULL));
 478
 479   case_processing_summary (n_valid, n_missing, dataset_dict (ds));
 480 }
 481
 482 static void
 483 case_processing_summary (casenumber n_valid, casenumber n_missing,
 484                          const struct dictionary *dict)
 485 {
 486   struct pivot_table *table = pivot_table_create (
 487     N_("Case Processing Summary"));
 488   pivot_table_set_weight_var (table, dict_get_weight (dict));
 489
 490   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
 491                           N_("N"), PIVOT_RC_COUNT,
 492                           N_("Percent"), PIVOT_RC_PERCENT);
 493
 494   struct pivot_dimension *cases = pivot_dimension_create (
 495     table, PIVOT_AXIS_ROW, N_("Cases"), N_("Valid"), N_("Excluded"),
 496     N_("Total"));
 497   cases->root->show_label = true;
 498
 499   casenumber total = n_missing + n_valid;
 500
 501   struct entry
 502     {
 503       int stat_idx;
 504       int case_idx;
 505       double x;
 506     }
 507   entries[] = {
 508     { 0, 0, n_valid },
 509     { 0, 1, n_missing },
 510     { 0, 2, total },
 511     { 1, 0, 100.0 * n_valid / total },
 512     { 1, 1, 100.0 * n_missing / total },
 513     { 1, 2, 100.0 }
 514   };
 515
 516   for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
 517     {
 518       const struct entry *e = &entries[i];
 519       pivot_table_put2 (table, e->stat_idx, e->case_idx,
 520                         pivot_value_new_number (e->x));
 521     }
 522
 523   pivot_table_submit (table);
 524 }
 525
 526 static void
 527 reliability_summary_total (const struct reliability *rel)
 528 {
 529   struct pivot_table *table = pivot_table_create (N_("Item-Total Statistics"));
 530
 531   pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
 532                           N_("Scale Mean if Item Deleted"),
 533                           N_("Scale Variance if Item Deleted"),
 534                           N_("Corrected Item-Total Correlation"),
 535                           N_("Cronbach's Alpha if Item Deleted"));
 536
 537   struct pivot_dimension *variables = pivot_dimension_create (
 538     table, PIVOT_AXIS_ROW, N_("Variables"));
 539
 540   for (size_t i = 0; i < rel->sc[0].n_items; ++i)
 541     {
 542       const struct cronbach *s = &rel->sc[rel->total_start + i];
 543
 544       int var_idx = pivot_category_create_leaf (
 545         variables->root, pivot_value_new_variable (rel->sc[0].items[i]));
 546
 547       double mean;
 548       moments1_calculate (s->total, NULL, &mean, NULL, NULL, NULL);
 549
 550       double var;
 551       moments1_calculate (rel->sc[0].m[i], NULL, NULL, &var, NULL, NULL);
 552       double cov
 553         = (rel->sc[0].variance_of_sums + var - s->variance_of_sums) / 2.0;
 554
 555       double entries[] = {
 556         mean,
 557         s->variance_of_sums,
 558         (cov - var) / sqrt (var * s->variance_of_sums),
 559         s->alpha,
 560       };
 561       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
 562         pivot_table_put2 (table, i, var_idx,
 563                           pivot_value_new_number (entries[i]));
 564     }
 565
 566   pivot_table_submit (table);
 567 }
 568
 569
 570 static void
 571 reliability_statistics (const struct reliability *rel)
 572 {
 573   struct pivot_table *table = pivot_table_create (
 574     N_("Reliability Statistics"));
 575   pivot_table_set_weight_var (table, rel->wv);
 576
 577   if (rel->model == MODEL_ALPHA)
 578     {
 579       pivot_dimension_create (table, PIVOT_AXIS_COLUMN,
 580                               N_("Statistics"),
 581                               N_("Cronbach's Alpha"), PIVOT_RC_OTHER,
 582                               N_("N of Items"), PIVOT_RC_COUNT);
 583
 584       const struct cronbach *s = &rel->sc[0];
 585       pivot_table_put1 (table, 0, pivot_value_new_number (s->alpha));
 586       pivot_table_put1 (table, 1, pivot_value_new_number (s->n_items));
 587     }
 588   else
 589     {
 590       struct pivot_dimension *statistics = pivot_dimension_create (
 591         table, PIVOT_AXIS_ROW, N_("Statistics"));
 592       struct pivot_category *alpha = pivot_category_create_group (
 593         statistics->root, N_("Cronbach's Alpha"));
 594       pivot_category_create_group (alpha, N_("Part 1"),
 595                                    N_("Value"), PIVOT_RC_OTHER,
 596                                    N_("N of Items"), PIVOT_RC_COUNT);
 597       pivot_category_create_group (alpha, N_("Part 2"),
 598                                    N_("Value"), PIVOT_RC_OTHER,
 599                                    N_("N of Items"), PIVOT_RC_COUNT);
 600       pivot_category_create_leaves (alpha,
 601                                     N_("Total N of Items"), PIVOT_RC_COUNT);
 602       pivot_category_create_leaves (statistics->root,
 603                                     N_("Correlation Between Forms"),
 604                                     PIVOT_RC_OTHER);
 605       pivot_category_create_group (statistics->root,
 606                                    N_("Spearman-Brown Coefficient"),
 607                                    N_("Equal Length"), PIVOT_RC_OTHER,
 608                                    N_("Unequal Length"), PIVOT_RC_OTHER);
 609       pivot_category_create_leaves (statistics->root,
 610                                     N_("Guttman Split-Half Coefficient"),
 611                                     PIVOT_RC_OTHER);
 612
 613       /* R is the correlation between the two parts */
 614       double r0 = rel->sc[0].variance_of_sums -
 615         rel->sc[1].variance_of_sums -
 616         rel->sc[2].variance_of_sums;
 617       double r1 = (r0 / sqrt (rel->sc[1].variance_of_sums)
 618                    / sqrt (rel->sc[2].variance_of_sums)
 619                    / 2.0);
 620
 621       /* Guttman Split Half Coefficient */
 622       double g = 2 * r0 / rel->sc[0].variance_of_sums;
 623
 624       double tmp = (1.0 - r1*r1) * rel->sc[1].n_items * rel->sc[2].n_items /
 625         pow2 (rel->sc[0].n_items);
 626
 627       double entries[] = {
 628         rel->sc[1].alpha,
 629         rel->sc[1].n_items,
 630         rel->sc[2].alpha,
 631         rel->sc[2].n_items,
 632         rel->sc[1].n_items + rel->sc[2].n_items,
 633         r1,
 634         2 * r1 / (1.0 + r1),
 635         (sqrt (pow4 (r1) + 4 * pow2 (r1) * tmp) - pow2 (r1)) / (2 * tmp),
 636         g,
 637       };
 638       for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
 639         pivot_table_put1 (table, i, pivot_value_new_number (entries[i]));
 640     }
 641
 642   pivot_table_submit (table);
 643 }