EXAMINE: Allow the /ID subcommand to take a non-numeric variable
author John Darrington <john@darrington.wattle.id.au>
Thu, 22 Mar 2012 09:56:01 +0000 (10:56 +0100)
committer John Darrington <john@darrington.wattle.id.au>
Thu, 22 Mar 2012 20:49:56 +0000 (21:49 +0100)
src/language/stats/examine.c
src/math/box-whisker.c
src/math/box-whisker.h
tests/language/stats/examine.at

index 5d9f642641e038ec6dd343659bf6dbca52db1ba6..069f807c0a17d40cae2deed032021228c4855250 100644 (file)
@@ -110,11 +110,9 @@ struct examine
 
   bool missing_pw;
 
-  /* Test options require that casenumbers are known */
-  bool casenumbers;
-
   /* The case index of the ID value (or -1) if not applicable */
   size_t id_idx;
+  int id_width;
 
   enum pc_alg pc_alg;
   double *ptiles;
@@ -138,7 +136,7 @@ struct extremity
 
   /* Either the casenumber or the value of the variable specified
      by the /ID subcommand which corresponds to this extremity */
-  double identity;
+  union value identity;
 };
 
 struct exploratory_stats
@@ -1150,13 +1148,21 @@ extremes_report (const struct examine *cmd, int iact_idx)
                           &F_8_0);
 
               /* The casenumber */
-              tab_double (t,
+              if (cmd->id_var)
+                tab_value (t,
+                           heading_columns,
+                           heading_rows + v * rows_per_var + i * rows_per_cat + e,
+                           TAB_RIGHT,
+                           &es->maxima[e].identity,
+                           cmd->id_var,
+                           NULL);
+              else 
+                tab_double (t,
                           heading_columns,
-                          heading_rows + v * rows_per_var + i * rows_per_cat + e,
-                          0,
-                          es->maxima[e].identity,
-                          &F_8_0);
-
+                            heading_rows + v * rows_per_var + i * rows_per_cat + e,
+                            TAB_RIGHT,
+                            es->maxima[e].identity.f,
+                            &F_8_0);
 
               tab_double (t,
                           heading_columns + 1,
@@ -1175,12 +1181,21 @@ extremes_report (const struct examine *cmd, int iact_idx)
                           &F_8_0);
 
               /* The casenumber */
-              tab_double (t,
-                          heading_columns,
-                          heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
-                          0,
-                          es->minima[e].identity,
-                          &F_8_0);
+              if (cmd->id_var)
+                tab_value (t,
+                           heading_columns,
+                           heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
+                           TAB_RIGHT,
+                           &es->minima[e].identity,
+                           cmd->id_var,
+                           NULL);
+              else
+                tab_double (t,
+                            heading_columns,
+                            heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
+                            TAB_RIGHT,
+                            es->minima[e].identity.f,
+                            &F_8_0);
 
               tab_double (t,
                           heading_columns + 1,
@@ -1483,10 +1498,11 @@ update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
 
       moments_pass_one (es[v].mom, x, weight);
 
-      /* Save the value and the casenumber to the writer */
+      /* Save the value and the ID to the writer */
+      assert (examine->id_idx != -1);
       case_data_rw_idx (outcase, EX_VAL)->f = x;
-      if ( examine->id_idx != -1)
-        case_data_rw_idx (outcase, EX_ID)->f = case_data_idx (c, examine->id_idx)->f;
+      value_copy (case_data_rw_idx (outcase, EX_ID),
+                  case_data_idx (c, examine->id_idx), examine->id_width);
 
       case_data_rw_idx (outcase, EX_WT)->f = weight;
       
@@ -1534,6 +1550,11 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
 
       es[v].maxima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].maxima));
       es[v].minima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].minima));
+      for (i = 0; i < examine->calc_extremes; ++i)
+        {
+          value_init_pool (examine->pool, &es[v].maxima[i].identity, examine->id_width) ;
+          value_init_pool (examine->pool, &es[v].minima[i].identity, examine->id_width) ;
+        }
       
       for (reader = casereader_clone (es[v].sorted_reader);
            (c = casereader_read (reader)) != NULL; case_unref (c))
@@ -1553,7 +1574,7 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
                 {
                   struct extremity *min = &es[v].minima[x];
                   min->val = val;
-                  min->identity = case_data_idx (c, EX_ID)->f;
+                  value_copy (&min->identity, case_data_idx (c, EX_ID), examine->id_width);
                 }
               imin += wt;
             }
@@ -1572,7 +1593,7 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
 
                   max = &es[v].maxima[x];
                   max->val = val;
-                  max->identity = case_data_idx (c, EX_ID)->f;
+                  value_copy (&max->identity, case_data_idx (c, EX_ID), examine->id_width);
                 }
             }
         }
@@ -1623,7 +1644,7 @@ calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
           struct order_stats *os;
 
           es[v].box_whisker = box_whisker_create (es[v].hinges, 
-                                                  EX_ID);
+                                                  EX_ID, examine->id_var);
 
           os = &es[v].box_whisker->parent;
          order_stats_accumulate_idx (&os, 1,
@@ -1721,24 +1742,20 @@ run_examine (struct examine *cmd, struct casereader *input)
   
   cmd->wv = dict_get_weight (cmd->dict);
 
-  cmd->id_idx = -1;
   cmd->cats
     = categoricals_create (cmd->iacts, cmd->n_iacts,  
                            cmd->wv, cmd->exclude);
 
   categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
 
-  if (cmd->casenumbers)
+  if (cmd->id_idx == -1)
     {
       struct ccase *c = casereader_peek (input,  0);
 
-      if (cmd->id_var) 
-        cmd->id_idx = var_get_case_index (cmd->id_var);
-      else
-        {
-          cmd->id_idx = case_get_value_cnt (c);
-          input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
-        }
+      assert (cmd->id_var == NULL);
+
+      cmd->id_idx = case_get_value_cnt (c);
+      input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
 
       case_unref (c);
     }
@@ -1814,7 +1831,6 @@ cmd_examine (struct lexer *lexer, struct dataset *ds)
   struct examine examine;
   bool percentiles_seen = false;
 
-  examine.casenumbers = false;
   examine.missing_pw = false;
   examine.disp_extremes = 0;
   examine.calc_extremes = 0;
@@ -1823,13 +1839,12 @@ cmd_examine (struct lexer *lexer, struct dataset *ds)
   examine.pc_alg = PC_HAVERAGE;
   examine.ptiles = NULL;
   examine.n_percentiles = 0;
-  examine.id_var = 0;
+  examine.id_idx = -1;
+  examine.id_width = 0;
+  examine.id_var = NULL;
   examine.boxplot_mode = BP_GROUPS;
   
   examine.ex_proto = caseproto_create ();
-  examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
-  examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* id */
-  examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
 
   examine.pool = pool_create ();
 
@@ -2114,6 +2129,7 @@ cmd_examine (struct lexer *lexer, struct dataset *ds)
         }
     }
 
+
   if ( totals_seen && nototals_seen)
     {
       msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
@@ -2133,18 +2149,22 @@ cmd_examine (struct lexer *lexer, struct dataset *ds)
     }
 
 
-  if (examine.disp_extremes > 0)
+  if ( examine.id_var )
     {
-      examine.calc_extremes = examine.disp_extremes;
-      examine.casenumbers = true;
+      examine.id_idx = var_get_case_index (examine.id_var);
+      examine.id_width = var_get_width (examine.id_var);
     }
 
-  if (examine.boxplot)
+  examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
+  examine.ex_proto = caseproto_add_width (examine.ex_proto, examine.id_width);   /* id */
+  examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
+
+
+  if (examine.disp_extremes > 0)
     {
-      examine.casenumbers = true;
+      examine.calc_extremes = examine.disp_extremes;
     }
 
-
   if (examine.descriptives && examine.calc_extremes == 0)
     {
       /* Descriptives always displays the max and min */
index fb5c2c62a21fefe6842f820ef6599c4e05bbf3c7..0d893cb80c14d98e4b7bf3c24c5603b4d23c5d27 100644 (file)
@@ -22,6 +22,7 @@
 #include <float.h>
 
 #include "data/case.h"
+#include "data/data-out.h"
 #include "data/val-type.h"
 #include "data/variable.h"
 #include "libpspp/assertion.h"
@@ -82,9 +83,21 @@ acc (struct statistic *s, const struct ccase *cx,
   o->extreme = extreme;
   ds_init_empty (&o->label);
 
-  ds_put_format (&o->label,
-                  "%ld",
-                 (casenumber) case_data_idx (cx, bw->casenumber_idx)->f);
+  if (bw->id_var)
+    {
+      char *s = data_out (case_data_idx (cx, bw->id_idx),
+                           var_get_encoding (bw->id_var),
+                           var_get_print_format (bw->id_var));
+
+      ds_put_cstr (&o->label, s);
+      free (s);
+    }
+  else
+    {
+      ds_put_format (&o->label,
+                     "%ld",
+                     (casenumber) case_data_idx (cx, bw->id_idx)->f);
+    }
 
   ll_push_head (&bw->outliers, &o->ll);
 }
@@ -115,12 +128,13 @@ box_whisker_outliers (const struct box_whisker *bw)
 
   TH are the tukey hinges of the dataset.
 
-  Casenumber_idx is the index into the casereader which will be used to label 
+  id_idx is the index into the casereader which will be used to label 
   outliers.
+  id_var is the variable from which that label came, or NULL
 */
 struct box_whisker *
 box_whisker_create (const struct tukey_hinges *th,
-                   size_t casenumber_idx)
+                   size_t id_idx, const struct variable *id_var)
 {
   struct box_whisker *w = xzalloc (sizeof (*w));
   struct order_stats *os = &w->parent;
@@ -133,7 +147,8 @@ box_whisker_create (const struct tukey_hinges *th,
 
   tukey_hinges_calculate (th, w->hinges);
 
-  w->casenumber_idx = casenumber_idx;
+  w->id_idx = id_idx;
+  w->id_var = id_var;
 
   w->step = (w->hinges[2] - w->hinges[0]) * 1.5;
 
index c9b2bef8d37d5227502eed9a48855e9771dde9e3..ccafef1fc9069a509c064f813374356339a6775a 100644 (file)
@@ -26,6 +26,7 @@
    However, there's no reason not to use it for other purposes too.
  */
 struct tukey_hinges;
+struct variable;
 
 struct outlier
 {
@@ -47,11 +48,12 @@ struct box_whisker
 
   double step;
 
-  size_t casenumber_idx;
+  size_t id_idx;
+  const struct variable *id_var;
 };
 
 struct box_whisker * box_whisker_create (const struct tukey_hinges *,
-                                         size_t);
+                                         size_t id_idx, const struct variable *id_var);
 
 void box_whisker_whiskers (const struct box_whisker *bw, double whiskers[2]);
 
index 40c5c1a9663f2e721408dcd3f4a086c9fefc8c66..64f6d7c573f84ae9c6aafb29517d461e1ea4e343 100644 (file)
@@ -587,3 +587,56 @@ AT_CHECK([$PERL more-big-input.pl >> large.txt])
 AT_CHECK([pspp -o pspp.csv examine.sps])
 dnl Ignore output -- this is just a no-crash check.
 AT_CLEANUP
+
+
+dnl Test that the /ID subcommand works with non-numeric variables
+AT_SETUP([EXAMINE -- non-numeric ID])
+
+AT_DATA([examine-id.sps], [dnl
+data list notable list /x * y (a12).
+begin data.
+1  one
+2  two
+3  three
+4  four
+5  five
+6  six
+7  seven
+8  eight
+9  nine
+10 ten
+11 eleven
+12 twelve
+30 thirty
+300 threehundred
+end data.
+
+examine x
+       /statistics = extreme
+       /id = y
+       /plot = boxplot
+       .
+])
+
+AT_CHECK([pspp -O format=csv examine-id.sps], [0], 
+[Table: Case Processing Summary
+,Cases,,,,,
+,Valid,,Missing,,Total,
+,N,Percent,N,Percent,N,Percent
+x,14,100%,0,0%,14,100%
+
+Table: Extreme Values
+,,,y,Value
+x,Highest,1,threehundred,300.00
+,,2,thirty      ,30.00
+,,3,twelve      ,12.00
+,,4,eleven      ,11.00
+,,5,ten         ,10.00
+,Lowest,1,one         ,1.00
+,,2,two         ,2.00
+,,3,three       ,3.00
+,,4,four        ,4.00
+,,5,five        ,5.00
+])
+
+AT_CLEANUP 
\ No newline at end of file