Rewrote most of the examine command.
author John Darrington <john@darrington.wattle.id.au>
Sat, 20 Nov 2004 12:26:20 +0000 (12:26 +0000)
committer John Darrington <john@darrington.wattle.id.au>
Sat, 20 Nov 2004 12:26:20 +0000 (12:26 +0000)
15 files changed:
doc/ChangeLog
doc/statistics.texi
src/ChangeLog
src/casefile.c
src/casefile.h
src/examine.q
src/factor_stats.c
src/factor_stats.h
src/glob.c
src/hash.c
src/value-labels.c
src/var-labs.c
tests/ChangeLog
tests/Makefile.am
tests/command/examine.sh [new file with mode: 0755]

index c674017bd0f81b117f7cf818fd161e5d0297c067..7618916e91c8b8bd966692770701f96f73be8b19 100644 (file)
@@ -1,3 +1,5 @@
+       * statistics.texi Added documentation about the EXAMINE cmd
+
 Tue Nov 16 13:18:53 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
        * utilities.texi Added documentation for the PERMISSIONS command.
@@ -11,7 +13,7 @@ Tue Nov  9 09:38:43 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
 Fri Nov  5 17:46:46 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
-       * Added a note to the about SPLIT requiring adjecent cases.
+       * Added a note about SPLIT requiring adjacent cases.
 
 Sat Oct 30 17:32:53 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
index 6014e670e4ed14c818c152c0589817443648b876..08d66d797e922554512fc209760b22a3ed182a93 100644 (file)
@@ -10,6 +10,7 @@ far.
 @menu
 * DESCRIPTIVES::                Descriptive statistics.
 * FREQUENCIES::                 Frequency tables.
+* EXAMINE::                     Testing data for normality.
 * CROSSTABS::                   Crosstabulation tables.
 * T-TEST::                      Test hypotheses about means.
 * ONEWAY::                      One way analysis of variance.
@@ -105,7 +106,7 @@ in the order that they are specified on the VARIABLES subcommand.  The A
 and D settings request an ascending or descending sort order,
 respectively.
 
-@node FREQUENCIES, CROSSTABS, DESCRIPTIVES, Statistics
+@node FREQUENCIES, EXAMINE, DESCRIPTIVES, Statistics
 @section FREQUENCIES
 
 @vindex FREQUENCIES
@@ -212,7 +213,32 @@ boundaries of the data set divided into the specified number of ranges.
 For instance, @code{/NTILES=4} would cause quartiles to be reported.
 
 
-@node CROSSTABS, T-TEST, FREQUENCIES, Statistics
+@node EXAMINE, CROSSTABS, FREQUENCIES, Statistics
+@comment  node-name,  next,  previous,  up
+@section EXAMINE
+@vindex EXAMINE
+
+@cindex Normality, testing for
+
+@display
+EXAMINE
+        VARIABLES=var_list [[BY var_list] [BY var_list]]
+        /STATISTICS=@{DESCRIPTIVES, EXTREME[(n)], ALL, NONE@}
+        /PLOT=@{STEMLEAF, BOXPLOT, NPPLOT, SPREADLEVEL(n), HISTOGRAM, 
+              ALL, NONE@}
+        /CINTERVAL n
+        /COMPARE=@{GROUPS,VARIABLES@}
+        /ID=@{case_number, var_name@}
+        /@{TOTAL,NOTOTAL@}
+        /MISSING=@{LISTWISE, PAIRWISE@} [@{EXCLUDE, INCLUDE@}] 
+               [@{NOREPORT,REPORT@}]
+@end display
+
+The @cmd{EXAMINE} command is used to test how closely a distribution follows a 
+normal distribution.  It also shows you outliers and extreme values.
+
+
+@node CROSSTABS, T-TEST, EXAMINE, Statistics
 @section CROSSTABS
 
 @vindex CROSSTABS
index dbf1f89cadfe05750f7185a1eed6fcff5cbfb711..07513dc2206d65669d43885f67d4d6b5d1331a17 100644 (file)
@@ -1,3 +1,18 @@
+       * var-labs.c (var_to_string) Now returns null if the variable is null
+
+       * value-labels.c (value_to_string) Made it return null if either the 
+       value or the variable is null.
+
+       * hash.c (hsh_clear) Fixed a buglet.
+
+       * examine.q  factor_stats.[ch] Largely rewritten, because I'd started 
+       with the wrong model.
+
+       * casefile.[ch] Added a function to return the casereader.case_idx 
+       member
+
+       * examine.q  Implemented the extreme values results.
+
 John Darrington <john@darrington.wattle.id.au>
 
        * settings.h set.c glob.[ch] frequencies.q q2c.c error.c lexer.[ch] 
@@ -47,6 +62,7 @@ Tue Nov 16 13:19:18 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
        * permissions.c command.def Added the PERMISSIONS command
 
+>>>>>>> 1.110
 Mon Nov 15 01:33:32 2004  Ben Pfaff  <blp@gnu.org>
 
        * q2c.c: (dump_header) Don't try to emit #includes at very top of
@@ -273,6 +289,7 @@ Mon Nov 15 00:30:33 2004  Ben Pfaff  <blp@gnu.org>
        (var_dtor_free) New function.
        (discard_variables) Use NULL instead of inline_file.
 
+>>>>>>> 1.106
 Fri Nov 12 10:07:11 WST 2004 John Darrington <john@darrington.wattle.id.au>
 
        * value-labs.c  Fixed the implmentation of value_to_string, so 
index 6d74e8e286ee3a8451b90db6951fb2f7599789b1..6033427c47354079a2863af36e4c041a52ce3f85 100644 (file)
@@ -90,6 +90,13 @@ struct casereader
     struct ccase c;                     /* Current case. */
   };
 
+/* Return the case number of the current case */
+unsigned long
+casereader_cnum(const struct casereader *r)
+{
+  return r->case_idx;
+}
+
 /* Doubly linked list of all casefiles. */
 static struct casefile *casefiles;
 
index 5674de0a327df61bb90566c912aa24ebe03091d9..a074cb36273b19a88be38f58fa8b8aa3eadfd518 100644 (file)
@@ -48,4 +48,6 @@ int casereader_read (struct casereader *, struct ccase *);
 int casereader_read_xfer (struct casereader *, struct ccase *);
 void casereader_destroy (struct casereader *);
 
+unsigned long casereader_cnum(const struct casereader *);
+
 #endif /* casefile.h */
index e9e0ca7ec252aff8468c96d2992fd21035009a1f..697176f3269f23cb6af8c2e9b67efc023896a823 100644 (file)
@@ -70,71 +70,107 @@ static struct variable **dependent_vars;
 
 static int n_dependent_vars;
 
-static struct hsh_table *hash_table_factors=0;
-
-
-
 
 struct factor 
 {
-  /* The independent variable for this factor */
-  struct variable *indep_var;
+  /* The independent variable */
+  struct variable *indep_var[2];
+
 
-  /* The  factor statistics for each value of the independent variable */
-  struct hsh_table *hash_table_val;
+  /* Hash table of factor stats indexed by 2 values */
+  struct hsh_table *fstats;
 
-  /* The subfactor (if any) */
-  struct factor *subfactor;
+  /* The hash table after it's been crunched */
+  struct factor_statistics **fs;
+
+  struct factor *next;
 
 };
 
+/* Linked list of factors */
+static struct factor *factors=0;
 
+static struct metrics *totals=0;
 
+void
+print_factors(void)
+{
+  struct factor *f = factors;
 
-/* Parse the clause specifying the factors */
-static int examine_parse_independent_vars(struct cmd_examine *cmd, 
-                                         struct hsh_table *hash_factors );
+  while (f) 
+    {
+      struct  factor_statistics **fs = f->fs;
+
+      printf("Factor: %s BY %s\n", 
+            var_to_string(f->indep_var[0]),
+            var_to_string(f->indep_var[1]) );
+
+
+      printf("Contains %d entries\n", hsh_count(f->fstats));
 
+      
+      while (*fs) 
+       {
+         printf("Factor %g; %g\n", (*fs)->id[0].f, (*fs)->id[1].f);
+         
+         /* 
+            printf("Factor %s; %s\n",
+            value_to_string(&(*fs)->id[0], f->indep_var[0]),
+            value_to_string(&(*fs)->id[1], f->indep_var[1]));
+         */
+
+                
+         printf("Sum is %g; ",(*fs)->m[0].sum);
+         printf("N is %g; ",(*fs)->m[0].n);
+         printf("Mean is %g\n",(*fs)->m[0].mean);
+
+         fs++ ;
+       }
 
+      f = f->next;
+    }
 
+  
+}
 
-/* Functions to support hashes of factors */
-int compare_factors(const struct factor *f1, const struct factor *f2, 
-                   void *aux);
 
-unsigned hash_factor(const struct factor *f, void *aux);
+/* Parse the clause specifying the factors */
+static int examine_parse_independent_vars(struct cmd_examine *cmd);
 
-void free_factor(struct factor *f, void *aux UNUSED);
 
 
 /* Output functions */
 static void show_summary(struct variable **dependent_var, int n_dep_var, 
-                        struct factor *f);
+                        const struct factor *f);
+
+static void show_extremes(struct variable **dependent_var, 
+                         int n_dep_var, 
+                         const struct factor *factor,
+                         int n_extremities);
 
 static void show_descriptives(struct variable **dependent_var, 
                              int n_dep_var, 
                              struct factor *factor);
 
 
-static void show_extremes(struct variable **dependent_var, 
-                         int n_dep_var, 
-                         struct factor *factor,
-                         int n_extremities);
-
+void np_plot(const struct metrics *m, const char *factorname);
 
-void np_plot(const struct metrics *m, const char *varname);
 
 
 
 /* Per Split function */
-static void run_examine(const struct casefile *cf, void *);
+static void run_examine(const struct casefile *cf, void *cmd_);
 
 static void output_examine(void);
 
 
-static struct factor_statistics *totals = 0;
+void factor_calc(struct ccase *c, int case_no, 
+                double weight, int case_missing);
 
 
+/* Function to use for testing for missing values */
+static is_missing_func value_is_missing;
+
 
 int
 cmd_examine(void)
@@ -142,48 +178,45 @@ cmd_examine(void)
 
   if ( !parse_examine(&cmd) )
     return CMD_FAILURE;
-  
+
+  /* If /MISSING=INCLUDE is set, then user missing values are ignored */
+  if (cmd.incl == XMN_INCLUDE ) 
+    value_is_missing = is_system_missing;
+  else
+    value_is_missing = is_missing;
+
   if ( cmd.st_n == SYSMIS ) 
     cmd.st_n = 5;
 
   if ( ! cmd.sbc_cinterval) 
     cmd.n_cinterval[0] = 95.0;
 
-
-  totals = xmalloc ( sizeof (struct factor_statistics *) );
-
-  totals->stats = xmalloc(sizeof ( struct metrics ) * n_dependent_vars);
-
-  multipass_procedure_with_splits (run_examine, NULL);
-
-
-  hsh_destroy(hash_table_factors);
-
-  free(totals->stats);
-  free(totals);
+  multipass_procedure_with_splits (run_examine, &cmd);
 
   return CMD_SUCCESS;
 };
 
 
+
 /* Show all the appropriate tables */
 static void
 output_examine(void)
 {
+  struct factor *fctr;
 
   /* Show totals if appropriate */
-  if ( ! cmd.sbc_nototal || 
-       ! hash_table_factors || 0 == hsh_count (hash_table_factors))
+  if ( ! cmd.sbc_nototal )
     {
-      show_summary(dependent_vars, n_dependent_vars,0);
+      show_summary(dependent_vars, n_dependent_vars, 0);
 
       if ( cmd.sbc_statistics ) 
        {
-         if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) 
-           show_descriptives(dependent_vars, n_dependent_vars, 0);
-         
          if ( cmd.a_statistics[XMN_ST_EXTREME]) 
            show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
+
+         if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) 
+           show_descriptives(dependent_vars, n_dependent_vars, 0);
+
        }
 
       if ( cmd.sbc_plot) 
@@ -193,62 +226,75 @@ output_examine(void)
              int v;
 
              for ( v = 0 ; v < n_dependent_vars; ++v ) 
-               {
-                 np_plot(&totals->stats[v], var_to_string(dependent_vars[v]));
-               }
-
+                 np_plot(&totals[v], var_to_string(dependent_vars[v]));
            }
        }
 
+
     }
 
 
-  /* Show grouped statistics  if appropriate */
-  if ( hash_table_factors && 0 != hsh_count (hash_table_factors))
+  /* Show grouped statistics  as appropriate */
+  fctr = factors;
+  while ( fctr ) 
     {
-      struct hsh_iterator hi;
-      struct factor *f;
+      show_summary(dependent_vars, n_dependent_vars, fctr);
 
-      for(f = hsh_first(hash_table_factors,&hi);
-         f != 0;
-         f = hsh_next(hash_table_factors,&hi)) 
+      if ( cmd.sbc_statistics ) 
        {
-         show_summary(dependent_vars, n_dependent_vars,f);
-
-         if ( cmd.sbc_statistics )
-           {
-             if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
-               show_descriptives(dependent_vars, n_dependent_vars, f);
-             
-             if ( cmd.a_statistics[XMN_ST_EXTREME])
-               show_extremes(dependent_vars, n_dependent_vars, f, cmd.st_n);
-           }
+         if ( cmd.a_statistics[XMN_ST_EXTREME]) 
+           show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
 
+         if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) 
+           show_descriptives(dependent_vars, n_dependent_vars, fctr);
+       }
 
-         if ( cmd.sbc_plot) 
+      if ( cmd.sbc_plot) 
+       {
+         if ( cmd.a_plot[XMN_PLT_NPPLOT] ) 
            {
-             if ( cmd.a_plot[XMN_PLT_NPPLOT] ) 
+             int v;
+             for ( v = 0 ; v < n_dependent_vars; ++ v)
                {
-                 struct hsh_iterator h2;
-                 struct factor_statistics *foo ;
-                 for (foo = hsh_first(f->hash_table_val,&h2);
-                      foo != 0 ; 
-                      foo  = hsh_next(f->hash_table_val,&h2))
+                 
+                 struct factor_statistics **fs = fctr->fs ;
+                 for ( fs = fctr->fs ; *fs ; ++fs ) 
                    {
-                     int v;
-                     for ( v = 0 ; v < n_dependent_vars; ++ v)
+                     char buf1[100];
+                     char buf2[100];
+                     sprintf(buf1, "%s (",
+                             var_to_string(dependent_vars[v]));
+                     
+                     sprintf(buf2, "%s = %s",
+                            var_to_string(fctr->indep_var[0]),
+                            value_to_string(&(*fs)->id[0],fctr->indep_var[0]));
+                     
+                     strcat(buf1, buf2);
+
+                     
+                     if ( fctr->indep_var[1] ) 
+                       {
+                         sprintf(buf2, "; %s = %s)",
+                                 var_to_string(fctr->indep_var[1]),
+                                 value_to_string(&(*fs)->id[1],
+                                                 fctr->indep_var[1]));
+                         strcat(buf1, buf2);
+                       }
+                     else
                        {
-                         char buf[100];
-                         sprintf(buf, "%s (%s = %s)",
-                                 var_to_string(dependent_vars[v]),
-                                 var_to_string(f->indep_var),
-                                 value_to_string(foo->id,f->indep_var));
-                         np_plot(&foo->stats[v], buf);
+                         strcat(buf1, ")");
                        }
+
+                     np_plot(&(*fs)->m[v],buf1);
+
                    }
+                 
                }
+
            }
        }
+
+      fctr = fctr->next;
     }
 
 }
@@ -281,54 +327,6 @@ xmn_custom_nototal(struct cmd_examine *p)
 }
 
 
-/* Compare two factors */
-int 
-compare_factors (const struct factor *f1, 
-                const struct factor *f2, 
-                void *aux)
-{
-  int indep_var_cmp = strcmp(f1->indep_var->name, f2->indep_var->name);
-
-  if ( 0 != indep_var_cmp ) 
-    return indep_var_cmp;
-
-  /* If the names are identical, and there are no subfactors then
-     the factors are identical */
-  if ( ! f1->subfactor &&  ! f2->subfactor ) 
-    return 0;
-    
-  /* ... otherwise we must compare the subfactors */
-
-  return compare_factors(f1->subfactor, f2->subfactor, aux);
-
-}
-
-/* Create a hash of a factor */
-unsigned 
-hash_factor( const struct factor *f, void *aux)
-{
-  unsigned h;
-  h = hsh_hash_string(f->indep_var->name);
-  
-  if ( f->subfactor ) 
-    h += hash_factor(f->subfactor, aux);
-
-  return h;
-}
-
-
-/* Free up a factor */
-void
-free_factor(struct factor *f, void *aux)
-{
-  hsh_destroy(f->hash_table_val);
-
-  if ( f->subfactor ) 
-    free_factor(f->subfactor, aux);
-
-  free(f);
-}
-
 
 /* Parser for the variables sub command */
 static int
@@ -350,981 +348,1075 @@ xmn_custom_variables(struct cmd_examine *cmd )
 
   assert(n_dependent_vars);
 
+  totals = xmalloc( sizeof(struct metrics) * n_dependent_vars);
+
   if ( lex_match(T_BY))
     {
-      hash_table_factors = hsh_create(4, 
-                                     (hsh_compare_func *) compare_factors, 
-                                     (hsh_hash_func *) hash_factor, 
-                                     (hsh_free_func *) free_factor, 0);
-
-      return examine_parse_independent_vars(cmd, hash_table_factors);
+      return examine_parse_independent_vars(cmd);
     }
 
-  
-  
   return 1;
 }
 
 
+
 /* Parse the clause specifying the factors */
 static int
-examine_parse_independent_vars(struct cmd_examine *cmd, 
-                              struct hsh_table *hash_table_factors )
+examine_parse_independent_vars(struct cmd_examine *cmd)
 {
-  struct factor *f = 0;
+
+  struct factor *sf = xmalloc(sizeof(struct factor));
 
   if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
       && token != T_ALL)
     return 2;
 
-  if ( !f ) 
-    {
-      f = xmalloc(sizeof(struct factor));
-      f->indep_var = 0;
-      f->hash_table_val = 0;
-      f->subfactor = 0;
-    }
-  
-  f->indep_var = parse_variable();
-  
-  if ( ! f->hash_table_val ) 
-    f->hash_table_val = hsh_create(4,(hsh_compare_func *) compare_indep_values,
-                                  (hsh_hash_func *) hash_indep_value,
-                                  (hsh_free_func *) free_factor_stats,
-                                  (void *) f->indep_var->width);
+
+  sf->indep_var[0] = parse_variable();
+  sf->indep_var[1] = 0;
 
   if ( token == T_BY ) 
     {
+
       lex_match(T_BY);
 
       if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
          && token != T_ALL)
        return 2;
 
-      f->subfactor = xmalloc(sizeof(struct factor));
+      sf->indep_var[1] = parse_variable();
 
-      f->subfactor->indep_var = parse_variable();
-      
-      f->subfactor->subfactor = 0;
-
-      f->subfactor->hash_table_val = 
-       hsh_create(4,
-                  (hsh_compare_func *) compare_indep_values,
-                  (hsh_hash_func *) hash_indep_value,
-                  (hsh_free_func *) free_factor_stats,
-                  (void *) f->subfactor->indep_var->width);
     }
 
-  hsh_insert(hash_table_factors, f);
+
+  sf->fstats = hsh_create(4,
+                         (hsh_compare_func *) factor_statistics_compare,
+                         (hsh_hash_func *) factor_statistics_hash,
+                         (hsh_free_func *) factor_statistics_free,
+                         0);
+
+  sf->next = factors;
+  factors = sf;
   
   lex_match(',');
 
   if ( token == '.' || token == '/' ) 
     return 1;
 
-  return examine_parse_independent_vars(cmd, hash_table_factors);
+  return examine_parse_independent_vars(cmd);
 }
 
 
+
+
 void populate_descriptives(struct tab_table *t, int col, int row, 
                           const struct metrics *fs);
 
+void populate_extremes(struct tab_table *t, int col, int row, int n, 
+                      const struct metrics *m);
 
-void populate_extremities(struct tab_table *t, int col, int row, int n);
+void populate_summary(struct tab_table *t, int col, int row,
+                     const struct metrics *m);
 
 
-/* Show the descriptives table */
+
+
+static int bad_weight_warn = 1;
+
+
+/* Perform calculations for the sub factors */
 void
-show_descriptives(struct variable **dependent_var, 
-                 int n_dep_var, 
-                 struct factor *factor)
+factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
 {
-  int i;
-  int heading_columns ;
-  int n_cols;
-  const int n_stat_rows = 13;
+  int v;
+  struct factor *fctr = factors;
 
-  const int heading_rows = 1;
-  int n_rows = heading_rows ;
+  while ( fctr) 
+    {
+      union value indep_vals[2] ;
 
-  struct tab_table *t;
+      indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
 
+      if ( fctr->indep_var[1] ) 
+       indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
+      else
+       indep_vals[1].f = SYSMIS;
 
-  if ( !factor ) 
-    {
-      heading_columns = 1;
-      n_rows += n_dep_var * n_stat_rows;
-    }
-  else
-    {
-      assert(factor->indep_var);
-      if ( factor->subfactor == 0 ) 
+      assert(fctr->fstats);
+
+      struct factor_statistics **foo = ( struct factor_statistics ** ) 
+       hsh_probe(fctr->fstats, (void *) &indep_vals);
+
+      if ( !*foo ) 
        {
-         heading_columns = 2;
-         n_rows += n_dep_var * hsh_count(factor->hash_table_val) * n_stat_rows;
+
+         *foo = create_factor_statistics(n_dependent_vars, 
+                                         &indep_vals[0],
+                                         &indep_vals[1]);
+
+         for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
+           {
+             metrics_precalc( &(*foo)->m[v] );
+           }
+
        }
-      else
+
+      for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
        {
-         heading_columns = 3;
-         n_rows += n_dep_var * hsh_count(factor->hash_table_val) * 
-           hsh_count(factor->subfactor->hash_table_val) * n_stat_rows ;
+         const struct variable *var = dependent_vars[v];
+         const union value *val = case_data (c, var->fv);
+
+         if ( value_is_missing(val,var) || case_missing ) 
+           val = 0;
+
+         metrics_calc( &(*foo)->m[v], val, weight, case_no );
        }
+
+      fctr = fctr->next;
     }
 
-  n_cols = heading_columns + 4;
 
-  t = tab_create (n_cols, n_rows, 0);
+}
 
-  tab_headers (t, heading_columns + 1, 0, heading_rows, 0);
 
-  tab_dim (t, tab_natural_dimensions);
 
-  /* Outline the box and have no internal lines*/
-  tab_box (t, 
-          TAL_2, TAL_2,
-          -1, -1,
-          0, 0,
-          n_cols - 1, n_rows - 1);
 
-  tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows );
+static void 
+run_examine(const struct casefile *cf, void *cmd_ )
+{
+  struct casereader *r;
+  struct ccase c;
+  int v;
 
-  tab_vline (t, TAL_1, heading_columns, 0, n_rows - 1);
-  tab_vline (t, TAL_2, n_cols - 2, 0, n_rows - 1);
-  tab_vline (t, TAL_1, n_cols - 1, 0, n_rows - 1);
+  const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
 
-  tab_text (t, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
-  tab_text (t, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
+  /* Make sure we haven't got rubbish left over from a 
+     previous split */
 
+  struct factor *fctr = factors;
+  while (fctr) 
+    {
+      struct factor *next = fctr->next;
 
-  for ( i = 0 ; i < n_dep_var ; ++i ) 
+      hsh_clear(fctr->fstats);
+
+      fctr->fs = 0;
+
+      fctr = next;
+    }
+
+
+
+  for ( v = 0 ; v < n_dependent_vars ; ++v ) 
+    metrics_precalc(&totals[v]);
+
+  for(r = casefile_get_reader (cf);
+      casereader_read (r, &c) ;
+      case_destroy (&c) ) 
     {
-      int row;
-      int n_subfactors = 1;
-      int n_factors = 1;
-       
-      if ( factor ) 
+      int case_missing=0;
+      const int case_no = casereader_cnum(r);
+
+      const double weight = 
+       dict_get_case_weight(default_dict, &c, &bad_weight_warn);
+
+      if ( cmd->miss == XMN_LISTWISE ) 
        {
-         n_factors = hsh_count(factor->hash_table_val);
-         if (  factor->subfactor ) 
-           n_subfactors = hsh_count(factor->subfactor->hash_table_val);
+         for ( v = 0 ; v < n_dependent_vars ; ++v ) 
+           {
+             const struct variable *var = dependent_vars[v];
+             const union value *val = case_data (&c, var->fv);
+
+             if ( value_is_missing(val,var))
+               case_missing = 1;
+                  
+           }
        }
 
+      for ( v = 0 ; v < n_dependent_vars ; ++v ) 
+       {
+         const struct variable *var = dependent_vars[v];
+         const union value *val = case_data (&c, var->fv);
 
-      row = heading_rows + i * n_stat_rows * n_factors * n_subfactors; 
+         if ( value_is_missing(val,var) || case_missing ) 
+           val = 0;
+
+         metrics_calc(&totals[v], val, weight, case_no );
+    
+       }
+
+      factor_calc(&c, case_no, weight, case_missing);
+
+    }
 
-      if ( i > 0 )
-       tab_hline(t, TAL_1, 0, n_cols - 1, row );
 
-      if ( factor  )
+  for ( v = 0 ; v < n_dependent_vars ; ++v)
+    {
+      fctr = factors;
+      while ( fctr ) 
        {
          struct hsh_iterator hi;
-         const struct factor_statistics *fs;
-         int count = 0;
+         struct factor_statistics *fs;
 
-         tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-                   var_to_string(factor->indep_var));
+         for ( fs = hsh_first(fctr->fstats, &hi);
+               fs != 0 ;
+               fs = hsh_next(fctr->fstats, &hi))
+           {
+             metrics_postcalc(&fs->m[v]);
+           }
 
+         fctr = fctr->next;
+       }
+      metrics_postcalc(&totals[v]);
+    }
 
 
-         for (fs  = hsh_first(factor->hash_table_val, &hi);
-              fs != 0;
-              fs  = hsh_next(factor->hash_table_val,  &hi))
-           {
-             tab_text (t, 1, 
-                       row  + count * n_subfactors * n_stat_rows,
-                       TAB_RIGHT | TAT_TITLE, 
-                       value_to_string(fs->id, factor->indep_var)
-                       );
+  /* Make sure that the combination of factors are complete */
 
-             if ( count > 0 ) 
-               tab_hline (t, TAL_1, 1, n_cols - 1,  
-                          row  + count * n_subfactors * n_stat_rows);
+  fctr = factors;
+  while ( fctr ) 
+    {
+      struct hsh_iterator hi;
+      struct hsh_iterator hi0;
+      struct hsh_iterator hi1;
+      struct factor_statistics *fs;
+
+      struct hsh_table *idh0=0;
+      struct hsh_table *idh1=0;
+      union value *val0;
+      union value *val1;
+         
+      idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
+                       (hsh_hash_func *) hash_value,
+                       0,0);
 
-             if ( factor->subfactor ) 
-               {
-                 int count2=0;
-                 struct hsh_iterator h2;
-                 const struct factor_statistics *sub_fs;
-             
-                 tab_text (t, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-                           var_to_string(factor->subfactor->indep_var));
-
-                 for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, 
-                                          &h2);
-                       sub_fs != 0;
-                       sub_fs = hsh_next(factor->subfactor->hash_table_val, 
-                                         &h2))
-                   {
-                       
-                     tab_text(t, 2, 
-                              row
-                              + count * n_subfactors * n_stat_rows 
-                              + count2 * n_stat_rows,
-                              TAB_RIGHT | TAT_TITLE ,
-                              value_to_string(sub_fs->id, factor->subfactor->indep_var)
-                              );
-
-                     if ( count2 > 0 ) 
-                       tab_hline (t, TAL_1, 2, n_cols - 1,  
-                                  row
-                                  + count * n_subfactors * n_stat_rows 
-                                  + count2 * n_stat_rows);
-                              
-                     populate_descriptives(t, heading_columns,
-                                           row
-                                           + count * n_subfactors 
-                                           * n_stat_rows 
-                                           + count2 * n_stat_rows,
-                                           &sub_fs->stats[i]);
-                                           
-                       
-                     count2++;
-                   }
-               }
-             else
-               {
-                 
-                 populate_descriptives(t, heading_columns, 
-                                       row  
-                                       + count * n_subfactors * n_stat_rows, 
-                                       &fs->stats[i]);
-               }
+      idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
+                       (hsh_hash_func *) hash_value,
+                       0,0);
 
-             count ++;
-           }
+
+      for ( fs = hsh_first(fctr->fstats, &hi);
+           fs != 0 ;
+           fs = hsh_next(fctr->fstats, &hi))
+       {
+         hsh_insert(idh0,(void *) &fs->id[0]);
+         hsh_insert(idh1,(void *) &fs->id[1]);
        }
-      else
+
+      /* Ensure that the factors combination is complete */
+      for ( val0 = hsh_first(idh0, &hi0);
+           val0 != 0 ;
+           val0 = hsh_next(idh0, &hi0))
        {
-         populate_descriptives(t, heading_columns, 
-                               row, &totals->stats[i]);
+         for ( val1 = hsh_first(idh1, &hi1);
+               val1 != 0 ;
+               val1 = hsh_next(idh1, &hi1))
+           {
+             struct factor_statistics **ffs;
+             union value key[2];
+             key[0] = *val0;
+             key[1] = *val1;
+                 
+             ffs = (struct factor_statistics **) 
+               hsh_probe(fctr->fstats, (void *) &key );
+
+             if ( !*ffs ) {
+               int i;
+               (*ffs) = create_factor_statistics (n_dependent_vars,
+                                                  &key[0], &key[1]);
+               for ( i = 0 ; i < n_dependent_vars ; ++i ) 
+                 metrics_precalc( &(*ffs)->m[i]);
+             }
+           }
        }
 
-      tab_text (t, 
-               0, row,
-               TAB_LEFT | TAT_TITLE, 
-               var_to_string(dependent_var[i])
-               );
+      hsh_destroy(idh0);
+      hsh_destroy(idh1);
+
+      fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
 
+      fctr = fctr->next;
     }
 
-  tab_title (t, 0, _("Descriptives"));
+  /* 
+  print_factors();
+  */
 
-  tab_submit(t);
+  output_examine();
 
 }
 
 
-
-/* Fill in the descriptives data */
-void
-populate_descriptives(struct tab_table *tbl, int col, int row, 
-                     const struct metrics *m)
+static void
+show_summary(struct variable **dependent_var, int n_dep_var, 
+            const struct factor *fctr)
 {
+  static const char *subtitle[]=
+    {
+      N_("Valid"),
+      N_("Missing"),
+      N_("Total")
+    };
 
-  const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
-                                      m->n -1);
+  int i;
+  int heading_columns ;
+  int n_cols;
+  const int heading_rows = 3;
+  struct tab_table *tbl;
 
+  int n_rows ;
+  int n_factors = 1;
 
-  tab_text (tbl, col, 
-           row,
-           TAB_LEFT | TAT_TITLE,
-           _("Mean"));
+  if ( fctr )
+    {
+      heading_columns = 2;
+      n_factors = hsh_count(fctr->fstats);
+      n_rows = n_dep_var * n_factors ;
 
-  tab_float (tbl, col + 2,
-            row,
-            TAB_CENTER,
-            m->mean,
-            8,2);
-  
-  tab_float (tbl, col + 3,
-            row,
-            TAB_CENTER,
-            m->stderr,
-            8,3);
+      if ( fctr->indep_var[1] )
+         heading_columns = 3;
+    }
+  else
+    {
+      heading_columns = 1;
+      n_rows = n_dep_var;
+    }
+
+  n_rows += heading_rows;
+
+  n_cols = heading_columns + 6;
+
+  tbl = tab_create (n_cols,n_rows,0);
+  tab_headers (tbl, heading_columns, 0, heading_rows, 0);
+
+  tab_dim (tbl, tab_natural_dimensions);
   
+  /* Outline the box */
+  tab_box (tbl, 
+          TAL_2, TAL_2,
+          -1, -1,
+          0, 0,
+          n_cols - 1, n_rows - 1);
 
-  tab_text (tbl, col, 
-           row + 1,
-           TAB_LEFT | TAT_TITLE | TAT_PRINTF,
-           _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
+  /* Vertical lines for the data only */
+  tab_box (tbl, 
+          -1, -1,
+          -1, TAL_1,
+          heading_columns, 0,
+          n_cols - 1, n_rows - 1);
 
 
-  tab_text (tbl, col + 1, 
-           row  + 1,
-           TAB_LEFT | TAT_TITLE,
-           _("Lower Bound"));
+  tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
+  tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
+  tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
 
-  tab_float (tbl, col + 2,
-            row + 1,
-            TAB_CENTER,
-            m->mean - t * m->stderr, 
-            8,3);
+  tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
 
-  tab_text (tbl, col + 1,  
-           row + 2,
-           TAB_LEFT | TAT_TITLE,
-           _("Upper Bound"));
 
+  tab_title (tbl, 0, _("Case Processing Summary"));
+  
 
-  tab_float (tbl, col + 2,
-            row + 2,
-            TAB_CENTER,
-            m->mean + t * m->stderr, 
-            8,3);
+  tab_joint_text(tbl, heading_columns, 0, 
+                n_cols -1, 0,
+                TAB_CENTER | TAT_TITLE,
+                _("Cases"));
 
-  tab_text (tbl, col, 
-           row + 3,
-           TAB_LEFT | TAT_TITLE,
-           _("5% Trimmed Mean"));
+  /* Clear the internal lines in the top heading row, under "Cases" */
+  tab_box (tbl, 
+          -1, -1,
+          TAL_0, TAL_0,
+          heading_columns, 0,
+          n_cols - 1, 0);
 
-  tab_float (tbl, col + 2, 
-           row + 3,
-            TAB_CENTER,
-            m->trimmed_mean,
-            8,2);
+  for ( i = 0 ; i < 3 ; ++i ) 
+    {
+      tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE, 
+               _("N"));
 
-  tab_text (tbl, col, 
-           row + 4,
-           TAB_LEFT | TAT_TITLE,
-           _("Median"));
+      tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE, 
+               _("Percent"));
 
-  tab_text (tbl, col, 
-           row + 5,
-           TAB_LEFT | TAT_TITLE,
-           _("Variance"));
+      tab_joint_text(tbl, heading_columns + i*2 , 1,
+                    heading_columns + i*2 + 1, 1,
+                    TAB_CENTER | TAT_TITLE,
+                    subtitle[i]);
 
-  tab_float (tbl, col + 2,
-            row + 5,
-            TAB_CENTER,
-            m->var,
-            8,3);
+      tab_box (tbl, -1, -1,
+              TAL_0, TAL_0,
+              heading_columns + i*2, 1,
+              heading_columns + i*2 + 1, 1);
 
+    }
 
-  tab_text (tbl, col, 
-           row + 6,
-           TAB_LEFT | TAT_TITLE,
-           _("Std. Deviation"));
 
+  /* Titles for the independent variables */
+  if ( fctr ) 
+    {
+      tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
+               var_to_string(fctr->indep_var[0]));
 
-  tab_float (tbl, col + 2,
-            row + 6,
-            TAB_CENTER,
-            m->stddev,
-            8,3);
+      if ( fctr->indep_var[1] ) 
+       {
+         tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
+                   var_to_string(fctr->indep_var[1]));
+       }
+               
+    }
 
-  
-  tab_text (tbl, col, 
-           row + 7,
-           TAB_LEFT | TAT_TITLE,
-           _("Minimum"));
 
-  tab_float (tbl, col + 2,
-            row + 7,
-            TAB_CENTER,
-            m->min,
-            8,3);
+  for ( i = 0 ; i < n_dep_var ; ++i ) 
+    {
+      int n_factors = 1;
+      if ( fctr ) 
+       n_factors = hsh_count(fctr->fstats);
+      
 
-  tab_text (tbl, col, 
-           row + 8,
-           TAB_LEFT | TAT_TITLE,
-           _("Maximum"));
+      if ( i > 0 ) 
+       tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
+      
+      tab_text (tbl, 
+               0, i * n_factors + heading_rows,
+               TAB_LEFT | TAT_TITLE, 
+               var_to_string(dependent_var[i])
+               );
 
-  tab_float (tbl, col + 2,
-            row + 8,
-            TAB_CENTER,
-            m->max,
-            8,3);
 
+      if ( !fctr ) 
+       populate_summary(tbl, heading_columns, 
+                        (i * n_factors) + heading_rows,
+                        &totals[i]);
 
-  tab_text (tbl, col, 
-           row + 9,
-           TAB_LEFT | TAT_TITLE,
-           _("Range"));
 
+      else
+       {
+         struct factor_statistics **fs = fctr->fs;
+         int count = 0 ;
 
-  tab_float (tbl, col + 2,
-            row + 9,
-            TAB_CENTER,
-            m->max - m->min,
-            8,3);
+         while (*fs) 
+           {
+             static union value prev;
+             
+             if ( 0 != compare_values(&prev, &(*fs)->id[0], 
+                                      fctr->indep_var[0]->width))
+               {
+                  tab_text (tbl, 
+                            1,
+                            (i * n_factors ) + count + 
+                            heading_rows,
+                            TAB_LEFT | TAT_TITLE, 
+                            value_to_string(&(*fs)->id[0], fctr->indep_var[0])
+                            );
+
+                  if (fctr->indep_var[1] && count > 0 ) 
+                    tab_hline(tbl, TAL_1, 1, n_cols - 1, 
+                              (i * n_factors ) + count + heading_rows);
 
-  tab_text (tbl, col, 
-           row + 10,
-           TAB_LEFT | TAT_TITLE,
-           _("Interquartile Range"));
+               }
+             
+             prev = (*fs)->id[0];
 
-  tab_text (tbl, col, 
-           row + 11,
-           TAB_LEFT | TAT_TITLE,
-           _("Skewness"));
 
-  tab_text (tbl, col, 
-           row + 12,
-           TAB_LEFT | TAT_TITLE,
-           _("Kurtosis"));
+             if ( fctr->indep_var[1]) 
+               tab_text (tbl, 
+                         2,
+                         (i * n_factors ) + count + 
+                         heading_rows,
+                         TAB_LEFT | TAT_TITLE, 
+                         value_to_string(&(*fs)->id[1], fctr->indep_var[1])
+                         );
+
+             populate_summary(tbl, heading_columns, 
+                              (i * n_factors) + count 
+                              + heading_rows,
+                              &(*fs)->m[i]);
+
+             count++ ; 
+             fs++;
+           }
+       }
+    }
+
+  tab_submit (tbl);
 }
 
 
-void
-show_summary(struct variable **dependent_var, 
-            int n_dep_var, 
-            struct factor *factor)
+void 
+populate_summary(struct tab_table *t, int col, int row,
+                const struct metrics *m)
+
 {
-  static const char *subtitle[]=
-    {
-      N_("Valid"),
-      N_("Missing"),
-      N_("Total")
-    };
+  const double total = m->n + m->n_missing ; 
+
+  tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
+  tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
+  tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
+
+
+  if ( total > 0 ) {
+    tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", 
+             100.0 * m->n / total );
+
+    tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", 
+             100.0 * m->n_missing / total );
+
+    /* This seems a bit pointless: the total is always 100% of itself */
+    tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", 
+             100.0 * total / total );
+
+
+  }
+
+
+}  
 
+
+
+static void 
+show_extremes(struct variable **dependent_var, int n_dep_var, 
+             const struct factor *fctr, int n_extremities)
+{
   int i;
   int heading_columns ;
   int n_cols;
-  const int heading_rows = 3;
+  const int heading_rows = 1;
   struct tab_table *tbl;
 
-  int n_rows = heading_rows;
+  int n_factors = 1;
+  int n_rows ;
 
-  if ( !factor ) 
+  if ( fctr )
     {
-      heading_columns = 1;
-      n_rows += n_dep_var;
+      heading_columns = 2;
+      n_factors = hsh_count(fctr->fstats);
+
+      n_rows = n_dep_var * 2 * n_extremities * n_factors;
+
+      if ( fctr->indep_var[1] )
+         heading_columns = 3;
     }
   else
     {
-      assert(factor->indep_var);
-      if ( factor->subfactor == 0 ) 
-       {
-         heading_columns = 2;
-         n_rows += n_dep_var * hsh_count(factor->hash_table_val);
-       }
-      else
-       {
-         heading_columns = 3;
-         n_rows += n_dep_var * hsh_count(factor->hash_table_val) * 
-           hsh_count(factor->subfactor->hash_table_val) ;
-       }
+      heading_columns = 1;
+      n_rows = n_dep_var * 2 * n_extremities;
     }
 
+  n_rows += heading_rows;
 
-  n_cols = heading_columns + 6;
+  heading_columns += 2;
+  n_cols = heading_columns + 2;
 
   tbl = tab_create (n_cols,n_rows,0);
   tab_headers (tbl, heading_columns, 0, heading_rows, 0);
 
   tab_dim (tbl, tab_natural_dimensions);
   
-  /* Outline the box and have vertical internal lines*/
+  /* Outline the box, no internal lines */
   tab_box (tbl, 
           TAL_2, TAL_2,
-          -1, TAL_1,
+          -1, -1,
           0, 0,
           n_cols - 1, n_rows - 1);
 
   tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
-  tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
-  tab_hline (tbl, TAL_1, 0, n_cols - 1, heading_rows -1 );
-
-  tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
 
+  tab_title (tbl, 0, _("Extreme Values"));
 
-  tab_title (tbl, 0, _("Case Processing Summary"));
-  
 
-  tab_joint_text(tbl, heading_columns, 0, 
-                n_cols -1, 0,
-                TAB_CENTER | TAT_TITLE,
-                _("Cases"));
+  tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
+  tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
 
-  /* Remove lines ... */
-  tab_box (tbl, 
-          -1, -1,
-          TAL_0, TAL_0,
-          heading_columns, 0,
-          n_cols - 1, 0);
-
-  if ( factor ) 
+  if ( fctr ) 
     {
       tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-               var_to_string(factor->indep_var));
+               var_to_string(fctr->indep_var[0]));
 
-      if ( factor->subfactor ) 
+      if ( fctr->indep_var[1] ) 
        tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-                 var_to_string(factor->subfactor->indep_var));
+                 var_to_string(fctr->indep_var[1]));
     }
 
-  for ( i = 0 ; i < 3 ; ++i ) 
-    {
-      tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE, _("N"));
-      tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE, 
-               _("Percent"));
-
-      tab_joint_text(tbl, heading_columns + i*2 , 1,
-                    heading_columns + i*2 + 1, 1,
-                    TAB_CENTER | TAT_TITLE,
-                    subtitle[i]);
+  tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
+  tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
 
-      tab_box (tbl, -1, -1,
-              TAL_0, TAL_0,
-              heading_columns + i*2, 1,
-              heading_columns + i*2 + 1, 1);
 
-    }
 
 
   for ( i = 0 ; i < n_dep_var ; ++i ) 
     {
-      int n_subfactors = 1;
-      int n_factors = 1;
-       
-      if ( factor ) 
-       {
-         n_factors = hsh_count(factor->hash_table_val);
-         if (  factor->subfactor ) 
-           n_subfactors = hsh_count(factor->subfactor->hash_table_val);
-       }
 
-      tab_text (tbl, 
-               0, i * n_factors * n_subfactors + heading_rows,
+      if ( i > 0 ) 
+       tab_hline(tbl, TAL_1, 0, n_cols -1 , 
+                 i * 2 * n_extremities * n_factors + heading_rows);
+      
+      tab_text (tbl, 0,
+               i * 2 * n_extremities * n_factors  + heading_rows,
                TAB_LEFT | TAT_TITLE, 
                var_to_string(dependent_var[i])
                );
 
-      if ( factor  )
+
+      if ( !fctr ) 
+       populate_extremes(tbl, heading_columns - 2, 
+                         i * 2 * n_extremities * n_factors  + heading_rows,
+                         n_extremities, &totals[i]);
+
+      else
        {
-         struct hsh_iterator hi;
-         const struct factor_statistics *fs;
-         int count = 0;
+         struct factor_statistics **fs = fctr->fs;
+         int count = 0 ;
 
-         for (fs  = hsh_first(factor->hash_table_val, &hi);
-              fs != 0;
-              fs  = hsh_next(factor->hash_table_val,  &hi))
+         while (*fs) 
            {
-             tab_text (tbl, 1, 
-                       i * n_factors * n_subfactors + heading_rows
-                       + count * n_subfactors,
-                       TAB_RIGHT | TAT_TITLE, 
-                       value_to_string(fs->id, factor->indep_var)
-                       );
-
-             if ( factor->subfactor ) 
-               {
-                 int count2=0;
-                 struct hsh_iterator h2;
-                 const struct factor_statistics *sub_fs;
-               
-                 for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, 
-                                          &h2);
-                       sub_fs != 0;
-                       sub_fs = hsh_next(factor->subfactor->hash_table_val, 
-                                         &h2))
-                   {
-                       
-                     tab_text(tbl, 2, 
-                              i * n_factors * n_subfactors + heading_rows
-                              + count * n_subfactors + count2,
-                              TAB_RIGHT | TAT_TITLE ,
-                              value_to_string(sub_fs->id, factor->subfactor->indep_var)
-                              );
-                       
-                     count2++;
-                   }
-               }
-             count ++;
-           }
-       }
-    }
+             static union value prev ;
 
+             const int row = heading_rows + ( 2 * n_extremities )  * 
+               ( ( i  * n_factors  ) +  count );
 
-  tab_submit (tbl);
-  
-}
 
-static int bad_weight_warn = 1;
+             if ( 0 != compare_values(&prev, &(*fs)->id[0], 
+                                      fctr->indep_var[0]->width))
+               {
+                 
+                 if ( count > 0 ) 
+                   tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
+
+                 tab_text (tbl, 
+                           1, row,
+                           TAB_LEFT | TAT_TITLE, 
+                           value_to_string(&(*fs)->id[0], fctr->indep_var[0])
+                           );
+               }
 
+             prev = (*fs)->id[0];
 
-static void 
-run_examine(const struct casefile *cf, void *aux UNUSED)
-{
-  struct hsh_iterator hi;
-  struct factor *fctr;
+             if (fctr->indep_var[1] && count > 0 ) 
+               tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
 
-  struct casereader *r;
-  struct ccase c;
-  int v;
+             if ( fctr->indep_var[1]) 
+               tab_text (tbl, 2, row,
+                         TAB_LEFT | TAT_TITLE, 
+                         value_to_string(&(*fs)->id[1], fctr->indep_var[1])
+                         );
 
-  /* Make sure we haven't got rubbish left over from a 
-     previous split */
-  if ( hash_table_factors ) 
-    {
-      for ( fctr = hsh_first(hash_table_factors, &hi);
-           fctr != 0;
-           fctr = hsh_next (hash_table_factors, &hi) )
-       {
-         hsh_clear(fctr->hash_table_val);
+             populate_extremes(tbl, heading_columns - 2, 
+                               row, n_extremities,
+                               &(*fs)->m[i]);
 
-         while ( (fctr = fctr->subfactor) )
-           hsh_clear(fctr->hash_table_val);
+             count++ ; 
+             fs++;
+           }
        }
     }
 
-  for ( v = 0 ; v < n_dependent_vars ; ++v ) 
-    metrics_precalc(&totals->stats[v]);
+  tab_submit(tbl);
+}
 
-  for(r = casefile_get_reader (cf);
-      casereader_read (r, &c) ;
-      case_destroy (&c) ) 
-    {
-      const double weight = 
-       dict_get_case_weight(default_dict, &c, &bad_weight_warn);
 
-      for ( v = 0 ; v < n_dependent_vars ; ++v ) 
-       {
-         const struct variable *var = dependent_vars[v];
-         const union value *val = case_data (&c, var->fv);
 
-         metrics_calc(&totals->stats[v], val, weight);
-       }
+/* Fill in the extremities table */
+void 
+populate_extremes(struct tab_table *t, 
+                 int col, int row, int n, const struct metrics *m)
+{
+  int extremity;
+  int idx=0;
 
-      if ( hash_table_factors ) 
-       {
-         for ( fctr = hsh_first(hash_table_factors, &hi);
-               fctr != 0;
-               fctr = hsh_next (hash_table_factors, &hi) )
-           {
-             const union value *indep_val = 
-               case_data(&c, fctr->indep_var->fv);
+  const int n_data = hsh_count(m->ordered_data);
 
-             struct factor_statistics **foo = ( struct factor_statistics ** ) 
-               hsh_probe(fctr->hash_table_val, (void *) &indep_val);
+  tab_text(t, col, row,
+          TAB_RIGHT | TAT_TITLE ,
+          _("Highest")
+          );
 
-             if ( !*foo ) 
-               {
-                 *foo = xmalloc ( sizeof ( struct factor_statistics));
-                 (*foo)->id = indep_val;
-                 (*foo)->stats = xmalloc ( sizeof ( struct metrics ) 
-                                           * n_dependent_vars);
+  tab_text(t, col, row + n ,
+          TAB_RIGHT | TAT_TITLE ,
+          _("Lowest")
+          );
 
-                 for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
-                   metrics_precalc( &(*foo)->stats[v] );
 
-                 hsh_insert(fctr->hash_table_val, (void *) *foo);
-               }
+  tab_hline(t, TAL_1, col, col + 3, row + n );
+           
+  for (extremity = 0; extremity < n ; ++extremity ) 
+    {
+      /* Highest */
+      tab_float(t, col + 1, row + extremity,
+               TAB_RIGHT,
+               extremity + 1, 8, 0);
+
 
-             for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
-               {
-                 const struct variable *var = dependent_vars[v];
-                 const union value *val = case_data (&c, var->fv);
+      /* Lowest */
+      tab_float(t, col + 1, row + extremity + n,
+               TAB_RIGHT,
+               extremity + 1, 8, 0);
 
-                 metrics_calc( &(*foo)->stats[v], val, weight );
-               }
+    }
 
-             if ( fctr->subfactor  ) 
-               {
-                 struct factor *sfctr  = fctr->subfactor;
 
-                 const union value *ii_val = 
-                   case_data (&c, sfctr->indep_var->fv);
+  /* Lowest */
+  for (idx = 0, extremity = 0; extremity < n && idx < n_data ; ++idx ) 
+    {
+      int j;
+      const struct weighted_value *wv = &m->wv[idx];
+      struct case_node *cn = wv->case_nos;
 
-                 struct factor_statistics **bar = 
-                   (struct factor_statistics **)
-                   hsh_probe(sfctr->hash_table_val, (void *) &ii_val);
+      
+      for (j = 0 ; j < wv->w ; ++j  )
+       {
+         if ( extremity + j >= n ) 
+           break ;
 
-                 if ( !*bar ) 
-                   {
-                     *bar = xmalloc ( sizeof ( struct factor_statistics));
-                     (*bar)->id = ii_val;
-                     (*bar)->stats = xmalloc ( sizeof ( struct metrics ) 
-                                               * n_dependent_vars);
-                 
-                     for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
-                       metrics_precalc( &(*bar)->stats[v] );
+         tab_float(t, col + 3, row + extremity + j  + n,
+                   TAB_RIGHT,
+                   wv->v.f, 8, 2);
 
-                     hsh_insert(sfctr->hash_table_val, 
-                                (void *) *bar);
-                   }
+         tab_float(t, col + 2, row + extremity + j  + n,
+                   TAB_RIGHT,
+                   cn->num, 8, 0);
 
-                 for ( v =  0 ; v  < n_dependent_vars ; ++v ) 
-                   {
-                     const struct variable *var = dependent_vars[v];
-                     const union value *val = case_data (&c, var->fv);
+         if ( cn->next ) 
+             cn = cn->next;
 
-                     metrics_calc( &(*bar)->stats[v], val, weight );
-                   }
-               }
-           }
        }
 
+      extremity +=  wv->w ;
     }
 
-  for ( v = 0 ; v < n_dependent_vars ; ++v)
+
+  /* Highest */
+  for (idx = n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx ) 
     {
-      if ( hash_table_factors ) 
+      int j;
+      const struct weighted_value *wv = &m->wv[idx];
+      struct case_node *cn = wv->case_nos;
+
+      for (j = 0 ; j < wv->w ; ++j  )
        {
-       for ( fctr = hsh_first(hash_table_factors, &hi);
-             fctr != 0;
-             fctr = hsh_next (hash_table_factors, &hi) )
-         {
-           struct hsh_iterator h2;
-           struct factor_statistics *fs;
-
-           for ( fs = hsh_first(fctr->hash_table_val,&h2);
-                 fs != 0;
-                 fs = hsh_next(fctr->hash_table_val,&h2))
-             {
-               metrics_postcalc( &fs->stats[v] );
-             }
+         if ( extremity + j >= n ) 
+           break ;
 
-           if ( fctr->subfactor) 
-             {
-               struct hsh_iterator hsf;
-               struct factor_statistics *fss;
-
-               for ( fss = hsh_first(fctr->subfactor->hash_table_val,&hsf);
-                     fss != 0;
-                     fss = hsh_next(fctr->subfactor->hash_table_val,&hsf))
-                 {
-                   metrics_postcalc( &fss->stats[v] );
-                 }
-             }
-         }
-       }
+         tab_float(t, col + 3, row + extremity + j,
+                   TAB_RIGHT,
+                   wv->v.f, 8, 2);
 
-      metrics_postcalc(&totals->stats[v]);
-    }
+         tab_float(t, col + 2, row + extremity + j,
+                   TAB_RIGHT,
+                   cn->num, 8, 0);
 
-  output_examine();
+         if ( cn->next ) 
+             cn = cn->next;
+
+       }
 
+      extremity +=  wv->w ;
+    }
 }
 
 
-static void 
-show_extremes(struct variable **dependent_var, 
-             int n_dep_var, 
-             struct factor *factor,
-             int n_extremities)
+/* Show the descriptives table */
+void
+show_descriptives(struct variable **dependent_var, 
+                 int n_dep_var, 
+                 struct factor *fctr)
 {
   int i;
   int heading_columns ;
   int n_cols;
+  const int n_stat_rows = 13;
+
   const int heading_rows = 1;
-  struct tab_table *t;
 
-  int n_rows = heading_rows;
+  struct tab_table *tbl;
+
+  int n_factors = 1;
+  int n_rows ;
 
-  if ( !factor ) 
+  if ( fctr )
     {
-      heading_columns = 1 + 1;
-      n_rows += n_dep_var * 2 * n_extremities;
+      heading_columns = 4;
+      n_factors = hsh_count(fctr->fstats);
+
+      n_rows = n_dep_var * n_stat_rows * n_factors;
+
+      if ( fctr->indep_var[1] )
+         heading_columns = 5;
     }
   else
     {
-      assert(factor->indep_var);
-      if ( factor->subfactor == 0 ) 
-       {
-         heading_columns = 2 + 1;
-         n_rows += n_dep_var * 2 * n_extremities 
-           * hsh_count(factor->hash_table_val);
-       }
-      else
-       {
-         heading_columns = 3 + 1;
-         n_rows += n_dep_var * 2 * n_extremities 
-           * hsh_count(factor->hash_table_val)
-           * hsh_count(factor->subfactor->hash_table_val) ;
-       }
+      heading_columns = 3;
+      n_rows = n_dep_var * n_stat_rows;
     }
 
+  n_rows += heading_rows;
 
-  n_cols = heading_columns + 3;
-
-  t = tab_create (n_cols,n_rows,0);
-  tab_headers (t, heading_columns, 0, heading_rows, 0);
-
-  tab_dim (t, tab_natural_dimensions);
-  
-  /* Outline the box and have vertical internal lines*/
-  tab_box (t, 
-          TAL_2, TAL_2,
-          -1, TAL_1,
-          0, 0,
-          n_cols - 1, n_rows - 1);
-
-
-
-  tab_hline (t, TAL_2, 0, n_cols - 1, heading_rows );
+  n_cols = heading_columns + 2;
 
-  tab_title (t, 0, _("Extreme Values"));
 
+  tbl = tab_create (n_cols, n_rows, 0);
 
+  tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
 
+  tab_dim (tbl, tab_natural_dimensions);
 
-  /* Remove lines ... */
-  tab_box (t, 
+  /* Outline the box and have no internal lines*/
+  tab_box (tbl, 
+          TAL_2, TAL_2,
           -1, -1,
-          TAL_0, TAL_0,
-          heading_columns, 0,
-          n_cols - 1, 0);
+          0, 0,
+          n_cols - 1, n_rows - 1);
 
-  if ( factor ) 
-    {
-      tab_text (t, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-               var_to_string(factor->indep_var));
+  tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
 
-      if ( factor->subfactor ) 
-       tab_text (t, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
-                 var_to_string(factor->subfactor->indep_var));
-    }
+  tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
+  tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
+  tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
+
+  tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
+  tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
 
-  tab_text (t, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
-  tab_text (t, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
+  tab_title (tbl, 0, _("Descriptives"));
 
 
   for ( i = 0 ; i < n_dep_var ; ++i ) 
     {
-      int n_subfactors = 1;
-      int n_factors = 1;
-       
-      if ( factor ) 
-       {
-         n_factors = hsh_count(factor->hash_table_val);
-         if (  factor->subfactor ) 
-           n_subfactors = hsh_count(factor->subfactor->hash_table_val);
-       }
+      const int row = heading_rows + i * n_stat_rows * n_factors ;
+
+      if ( i > 0 )
+       tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
 
-      tab_text (t, 
-               0, i * 2 * n_extremities * n_factors * 
-               n_subfactors + heading_rows,
+      tab_text (tbl, 0,
+               i * n_stat_rows * n_factors  + heading_rows,
                TAB_LEFT | TAT_TITLE, 
                var_to_string(dependent_var[i])
                );
 
 
-      if ( i > 0 ) 
-       tab_hline (t, 
-                  TAL_1, 0, n_cols - 1,  
-                  heading_rows + 2 * n_extremities * 
-                  (i * n_factors * n_subfactors )
-                  );
-
-      if ( factor  )
+      if ( fctr  )
        {
-         struct hsh_iterator hi;
-         const struct factor_statistics *fs;
+         struct factor_statistics **fs = fctr->fs;
          int count = 0;
 
-         for ( fs  = hsh_first(factor->hash_table_val, &hi);
-               fs != 0;
-               fs  = hsh_next(factor->hash_table_val,  &hi))
+         tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
+                   var_to_string(fctr->indep_var[0]));
+
+
+         if ( fctr->indep_var[1])
+           tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, 
+                     var_to_string(fctr->indep_var[1]));
+
+         while( *fs ) 
            {
-             tab_text (t, 1, heading_rows + 2 * n_extremities * 
-                       (i * n_factors * n_subfactors 
-                        + count * n_subfactors),
-                       TAB_RIGHT | TAT_TITLE, 
-                       value_to_string(fs->id, factor->indep_var)
-                       );
 
-             if ( count > 0 ) 
-               tab_hline (t, TAL_1, 1, n_cols - 1,  
-                          heading_rows + 2 * n_extremities *
-                          (i * n_factors * n_subfactors 
-                           + count * n_subfactors));
+             static union value prev ;
 
+             const int row = heading_rows + n_stat_rows  * 
+               ( ( i  * n_factors  ) +  count );
 
-             if ( factor->subfactor ) 
-               {
-                 struct hsh_iterator h2;
-                 const struct factor_statistics *sub_fs;
-                 int count2=0;
-
-                 for ( sub_fs = hsh_first(factor->subfactor->hash_table_val, 
-                                          &h2);
-                       sub_fs != 0;
-                       sub_fs = hsh_next(factor->subfactor->hash_table_val, 
-                                         &h2))
-                   {
-                       
-                     tab_text(t, 2, heading_rows + 2 * n_extremities *
-                              (i * n_factors * n_subfactors 
-                               + count * n_subfactors + count2 ),
-                              TAB_RIGHT | TAT_TITLE ,
-                              value_to_string(sub_fs->id, 
-                                              factor->subfactor->indep_var)
-                              );
-
-
-                     if ( count2 > 0 ) 
-                       tab_hline (t, TAL_1, 2, n_cols - 1,  
-                                  heading_rows + 2 * n_extremities *
-                                  (i * n_factors * n_subfactors 
-                                   + count * n_subfactors + count2 ));
-
-                     populate_extremities(t,3, 
-                                          heading_rows + 2 * n_extremities *
-                                          (i * n_factors * n_subfactors 
-                                           + count * n_subfactors + count2),
-                                          n_extremities );
-                                          
-                     count2++;
-                   }
-               }
-             else
+
+             if ( 0 != compare_values(&prev, &(*fs)->id[0], 
+                                      fctr->indep_var[0]->width))
                {
-                 populate_extremities(t,2, 
-                                      heading_rows + 2 * n_extremities *
-                                      (i * n_factors * n_subfactors 
-                                       + count * n_subfactors),
-                                      n_extremities);
+                 
+                 if ( count > 0 ) 
+                   tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
+
+                 tab_text (tbl, 
+                           1, row,
+                           TAB_LEFT | TAT_TITLE, 
+                           value_to_string(&(*fs)->id[0], fctr->indep_var[0])
+                           );
                }
 
-             count ++;
+             prev = (*fs)->id[0];
+
+             if (fctr->indep_var[1] && count > 0 ) 
+               tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
+
+             if ( fctr->indep_var[1]) 
+               tab_text (tbl, 2, row,
+                         TAB_LEFT | TAT_TITLE, 
+                         value_to_string(&(*fs)->id[1], fctr->indep_var[1])
+                         );
+
+             populate_descriptives(tbl, heading_columns - 2, 
+                               row, &(*fs)->m[i]);
+
+             count++ ; 
+             fs++;
            }
+
        }
-      else
-       {
-         populate_extremities(t, 1, 
-                              heading_rows + 2 * n_extremities *
-                              (i * n_factors * n_subfactors ),
-                              n_extremities);
 
+      else 
+       {
+         
+         populate_descriptives(tbl, heading_columns - 2, 
+                               i * n_stat_rows * n_factors  + heading_rows,
+                               &totals[i]);
        }
     }
 
-  tab_submit (t);
+  tab_submit(tbl);
+
 }
 
 
 
-/* Fill in the extremities table */
-void 
-populate_extremities(struct tab_table *t, int col, int row, int n)
+
+
+
+/* Fill in the descriptives data */
+void
+populate_descriptives(struct tab_table *tbl, int col, int row, 
+                     const struct metrics *m)
 {
-  int i;
 
-  tab_text(t, col, row,
-          TAB_RIGHT | TAT_TITLE ,
-          _("Highest")
-          );
+  const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
+                                     m->n -1);
 
-  tab_text(t, col, row + n ,
-          TAB_RIGHT | TAT_TITLE ,
-          _("Lowest")
-          );
 
-  for (i = 0; i < n ; ++i ) 
-    {
-      tab_float(t, col + 1, row + i,
-               TAB_RIGHT,
-               i + 1, 8, 0);
+  tab_text (tbl, col, 
+           row,
+           TAB_LEFT | TAT_TITLE,
+           _("Mean"));
 
-      tab_float(t, col + 1, row + i + n,
-               TAB_RIGHT,
-               i + 1, 8, 0);
-    }
+  tab_float (tbl, col + 2,
+            row,
+            TAB_CENTER,
+            m->mean,
+            8,2);
+  
+  tab_float (tbl, col + 3,
+            row,
+            TAB_CENTER,
+            m->stderr,
+            8,3);
+  
+
+  tab_text (tbl, col, 
+           row + 1,
+           TAB_LEFT | TAT_TITLE | TAT_PRINTF,
+           _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
+
+
+  tab_text (tbl, col + 1, 
+           row  + 1,
+           TAB_LEFT | TAT_TITLE,
+           _("Lower Bound"));
+
+  tab_float (tbl, col + 2,
+            row + 1,
+            TAB_CENTER,
+            m->mean - t * m->stderr, 
+            8,3);
+
+  tab_text (tbl, col + 1,  
+           row + 2,
+           TAB_LEFT | TAT_TITLE,
+           _("Upper Bound"));
+
+
+  tab_float (tbl, col + 2,
+            row + 2,
+            TAB_CENTER,
+            m->mean + t * m->stderr, 
+            8,3);
+
+  tab_text (tbl, col, 
+           row + 3,
+           TAB_LEFT | TAT_TITLE,
+           _("5% Trimmed Mean"));
+
+  tab_float (tbl, col + 2, 
+            row + 3,
+            TAB_CENTER,
+            m->trimmed_mean,
+            8,2);
+
+  tab_text (tbl, col, 
+           row + 4,
+           TAB_LEFT | TAT_TITLE,
+           _("Median"));
+
+  tab_text (tbl, col, 
+           row + 5,
+           TAB_LEFT | TAT_TITLE,
+           _("Variance"));
+
+  tab_float (tbl, col + 2,
+            row + 5,
+            TAB_CENTER,
+            m->var,
+            8,3);
+
+
+  tab_text (tbl, col, 
+           row + 6,
+           TAB_LEFT | TAT_TITLE,
+           _("Std. Deviation"));
+
+
+  tab_float (tbl, col + 2,
+            row + 6,
+            TAB_CENTER,
+            m->stddev,
+            8,3);
+
+  
+  tab_text (tbl, col, 
+           row + 7,
+           TAB_LEFT | TAT_TITLE,
+           _("Minimum"));
+
+  tab_float (tbl, col + 2,
+            row + 7,
+            TAB_CENTER,
+            m->min,
+            8,3);
+
+  tab_text (tbl, col, 
+           row + 8,
+           TAB_LEFT | TAT_TITLE,
+           _("Maximum"));
+
+  tab_float (tbl, col + 2,
+            row + 8,
+            TAB_CENTER,
+            m->max,
+            8,3);
+
+
+  tab_text (tbl, col, 
+           row + 9,
+           TAB_LEFT | TAT_TITLE,
+           _("Range"));
+
+
+  tab_float (tbl, col + 2,
+            row + 9,
+            TAB_CENTER,
+            m->max - m->min,
+            8,3);
+
+  tab_text (tbl, col, 
+           row + 10,
+           TAB_LEFT | TAT_TITLE,
+           _("Interquartile Range"));
+
+  tab_text (tbl, col, 
+           row + 11,
+           TAB_LEFT | TAT_TITLE,
+           _("Skewness"));
+
+  tab_text (tbl, col, 
+           row + 12,
+           TAB_LEFT | TAT_TITLE,
+           _("Kurtosis"));
 }
 
 
 
+
+
 /* Plot the normal and detrended normal plots for m
    Label the plots with factorname */
 void
@@ -1346,6 +1438,10 @@ np_plot(const struct metrics *m, const char *factorname)
   const double slope = 1.0 / m->stddev;
   const double intercept = - m->mean / m->stddev;
 
+  /* Cowardly refuse to plot an empty data set */
+  if ( n_data == 0 ) 
+    return ; 
+
   chart_initialise(&np_chart);
   chart_write_title(&np_chart, _("Normal Q-Q Plot of %s"), factorname);
   chart_write_xlabel(&np_chart, _("Observed Value"));
index cb2197ada76e1b32bd365def88d9e73c165dbb39..16e1930d58188723db0ae4a7f64c3c9c9f083447 100644 (file)
@@ -35,7 +35,10 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 void
 metrics_precalc(struct metrics *fs)
 {
+  assert (fs) ;
+
   fs->n = 0;
+  fs->n_missing = 0;
   fs->ssq = 0;
   fs->sum = 0;
   fs->min = DBL_MAX;
@@ -44,18 +47,29 @@ metrics_precalc(struct metrics *fs)
   fs->ordered_data = hsh_create(20,
                                (hsh_compare_func *) compare_values,
                                (hsh_hash_func *) hash_value,
-                               0,
+                               (hsh_free_func *) weighted_value_free,
                                (void *) 0);
+
 }
 
+
+/* Include val in the calculation for the metrics.
+   If val is null, then treat it as MISSING
+*/
 void
-metrics_calc(struct metrics *fs, const union value *val, double weight)
+metrics_calc(struct metrics *fs, const union value *val, 
+            double weight, int case_no)
 {
-
-
   struct weighted_value **wv;
-  const double x = val->f;
+  double x;
   
+  if ( ! val ) 
+    {
+      fs->n_missing += weight;
+      return ;
+    }
+
+  x = val->f;
   fs->n    += weight;
   fs->ssq  += x * x * weight;
   fs->sum  += x * weight;
@@ -69,23 +83,38 @@ metrics_calc(struct metrics *fs, const union value *val, double weight)
   if ( *wv  ) 
     {
       /* If this value has already been seen, then simply 
-        increase its weight */
+        increase its weight  and push a new case number */
+
+      struct case_node *cn;
 
       assert( (*wv)->v.f == val->f );
       (*wv)->w += weight;      
+
+      cn = xmalloc( sizeof (struct case_node) ) ;
+      cn->next = (*wv)->case_nos ;
+      cn->num = case_no;
+
+      (*wv)->case_nos = cn;
     }
   else
     {
-      *wv = xmalloc( sizeof (struct weighted_value) );
+      struct case_node *cn;
+
+      *wv = weighted_value_create();
       (*wv)->v = *val;
       (*wv)->w = weight;
-      hsh_insert(fs->ordered_data,(void *) *wv);
+      
+      cn = xmalloc( sizeof (struct case_node) ) ;
+      cn->next=0;
+      cn->num = case_no;
+      (*wv)->case_nos  = cn;
+
     }
 
 }
 
 void
-metrics_postcalc(struct metrics *fs)
+metrics_postcalc(struct metrics *m)
 {
   double sample_var; 
   double cc = 0.0;
@@ -99,44 +128,52 @@ metrics_postcalc(struct metrics *fs)
 
   int n_data;
   
-  fs->mean = fs->sum / fs->n;
+  m->mean = m->sum / m->n;
 
-  sample_var = ( fs->ssq / fs->n  - fs->mean * fs->mean );
+  sample_var = ( m->ssq / m->n  - m->mean * m->mean );
 
-  fs->var  = fs->n * sample_var / ( fs->n - 1) ;
-  fs->stddev = sqrt(fs->var);
+  m->var  = m->n * sample_var / ( m->n - 1) ;
+  m->stddev = sqrt(m->var);
 
 
   /* FIXME: Check this is correct ???
      Shouldn't we use the sample variance ??? */
-  fs->stderr = sqrt (fs->var / fs->n) ;
+  m->stderr = sqrt (m->var / m->n) ;
+
+  data = (struct weighted_value **) hsh_data(m->ordered_data);
+  n_data = hsh_count(m->ordered_data);
 
-  data = (struct weighted_value **) hsh_data(fs->ordered_data);
-  n_data = hsh_count(fs->ordered_data);
+  if ( n_data == 0 ) 
+    {
+      m->trimmed_mean = m->mean;
+      return;
+    }
 
-  fs->wv = xmalloc ( sizeof (struct weighted_value) * n_data);
+
+  m->wv = xmalloc(sizeof(struct weighted_value ) * n_data);
 
   for ( i = 0 ; i < n_data ; ++i )
-    fs->wv[i] = *(data[i]);
+      m->wv[i] = *(data[i]);
 
-  sort (fs->wv, n_data, sizeof (struct weighted_value) , 
+  sort (m->wv, n_data, sizeof (struct weighted_value) , 
        (algo_compare_func *) compare_values, 0);
 
-
   
-  tc = fs->n * 0.05 ;
+  /* Trimmed mean calculation */
+
+  tc = m->n * 0.05 ;
   k1 = -1;
   k2 = -1;
 
 
   for ( i = 0 ; i < n_data ; ++i ) 
     {
-      cc += fs->wv[i].w;
-      fs->wv[i].cc = cc;
+      cc += m->wv[i].w;
+      m->wv[i].cc = cc;
 
-      fs->wv[i].rank = j + (fs->wv[i].w - 1) / 2.0 ;
+      m->wv[i].rank = j + (m->wv[i].w - 1) / 2.0 ;
       
-      j += fs->wv[i].w;
+      j += m->wv[i].w;
       
       if ( cc < tc ) 
        k1 = i;
@@ -146,44 +183,127 @@ metrics_postcalc(struct metrics *fs)
   k2 = n_data;
   for ( i = n_data -1  ; i >= 0; --i ) 
     {
-      if ( tc > fs->n - fs->wv[i].cc) 
+      if ( tc > m->n - m->wv[i].cc) 
        k2 = i;
     }
 
 
-  fs->trimmed_mean = 0;
+  m->trimmed_mean = 0;
   for ( i = k1 + 2 ; i <= k2 - 1 ; ++i ) 
     {
-      fs->trimmed_mean += fs->wv[i].v.f * fs->wv[i].w;
+      m->trimmed_mean += m->wv[i].v.f * m->wv[i].w;
     }
 
 
-  fs->trimmed_mean += (fs->n - fs->wv[k2 - 1].cc - tc) * fs->wv[k2].v.f ;
-  fs->trimmed_mean += (fs->wv[k1 + 1].cc - tc) * fs->wv[k1 + 1].v.f ;
-  fs->trimmed_mean /= 0.9 * fs->n ;
+  m->trimmed_mean += (m->n - m->wv[k2 - 1].cc - tc) * m->wv[k2].v.f ;
+  m->trimmed_mean += (m->wv[k1 + 1].cc - tc) * m->wv[k1 + 1].v.f ;
+  m->trimmed_mean /= 0.9 * m->n ;
 
 }
 
 
-/* Functions for hashes */
+struct weighted_value *
+weighted_value_create(void)
+{
+  struct weighted_value *wv;
+  wv = xmalloc (sizeof (struct weighted_value ));
+
+  wv->cc = 0;
+  wv->case_nos = 0;
+
+  return wv;
+}
 
 void 
-free_factor_stats(struct factor_statistics *f, int width UNUSED)
+weighted_value_free(struct weighted_value *wv)
 {
-  free (f);
+  struct case_node *cn = wv->case_nos;
+
+  while(cn)
+    {
+      struct case_node *next = cn->next;
+      
+      free(cn);
+      cn = next;
+    }
+
+  free(wv);
+
 }
 
-int
-compare_indep_values(const struct factor_statistics *f1, 
-                    const struct factor_statistics *f2, 
-                    int width)
+
+
+
+
+/* Create a factor statistics object for N dependent vars,
+   with ID0 and ID1 as the values of the independent variables */
+struct factor_statistics * 
+create_factor_statistics (int n, union value *id0, union value *id1)
+{
+  struct factor_statistics *f;
+
+  f =  xmalloc( sizeof  ( struct factor_statistics ));
+
+  f->id[0] = *id0;
+  f->id[1] = *id1;
+  f->m = xmalloc( sizeof ( struct metrics ) * n ) ;
+
+  return f;
+}
+
+
+void
+factor_statistics_free(struct factor_statistics *f)
 {
-  return compare_values(f1->id, f2->id, width);
+  free(f->m) ; 
+
+  free(f);
 }
 
 
-unsigned 
-hash_indep_value(const struct factor_statistics *f, int width)
+
+
+
+
+int 
+factor_statistics_compare(const struct factor_statistics *f0,
+                         const struct factor_statistics *f1, void *aux)
+{
+
+  int cmp0;
+
+  assert(f0);
+  assert(f1);
+
+  cmp0 = compare_values(&f0->id[0], &f1->id[0], aux);
+
+  if ( cmp0 != 0 ) 
+    return cmp0;
+
+
+  if ( ( f0->id[1].f == SYSMIS )  && (f1->id[1].f != SYSMIS) ) 
+    return 1;
+
+  if ( ( f0->id[1].f != SYSMIS )  && (f1->id[1].f == SYSMIS) ) 
+    return -1;
+
+  return compare_values(&f0->id[1], &f1->id[1], aux);
+  
+}
+
+unsigned int 
+factor_statistics_hash(const struct factor_statistics *f, void *aux)
 {
-  return hash_value(f->id, width);
+  
+  unsigned int h;
+
+  h = hash_value(&f->id[0], aux);
+  
+  if ( f->id[1].f != SYSMIS )
+    h += hash_value(&f->id[1], aux);
+
+
+  return h;
+
 }
+       
index c7f1216221f5401fa8444f4c0888c9483012ac54..cf660c6a1b25ec7634113ddd0e3c0fc91c0b8711 100644 (file)
@@ -28,25 +28,11 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 #include "hash.h"
 #include "val.h"
 
-struct weighted_value 
-{
-  union value v;
-
-  /* The weight */
-  double w;
-
-  /* The cumulative weight */
-  double cc; 
-
-  /* The rank */
-  double rank;
-};
-
-
-
 struct metrics
 {
   double n;
+
+  double n_missing;
   
   double ssq;
   
@@ -66,45 +52,88 @@ struct metrics
 
   double trimmed_mean;
 
-  /* An ordered arary of data for this factor */
+  /* A hash of data for this factor */
   struct hsh_table *ordered_data;
 
-  /* An SORTED array of weighted values */
+  /* A SORTED array of weighted values */
   struct weighted_value *wv;
 };
 
 
 
+
+void metrics_precalc(struct metrics *m);
+
+void metrics_calc(struct metrics *m, const union value *f, double weight, 
+                 int case_no);
+
+void metrics_postcalc(struct metrics *m);
+
+
+/* Linked list of case nos */
+struct case_node
+{
+  int num;
+  struct case_node *next;
+};
+
+struct weighted_value 
+{
+  union value v;
+
+  /* The weight */
+  double w;
+
+  /* The cumulative weight */
+  double cc; 
+
+  /* The rank */
+  double rank;
+
+  /* Linked list of cases nos which have this value */
+  struct case_node *case_nos;
+  
+};
+
+
+struct weighted_value *weighted_value_create(void);
+
+void weighted_value_free(struct weighted_value *wv);
+
+
+
 struct factor_statistics {
 
-  /* The value of the independent variable for this factor */
-  const union value *id;
+  /* The values of the independent variables */
+  union value id[2];
 
-  /* An array of metrics indexed by dependent variable */
-  struct metrics *stats;
+  /* An array of stats for this factor, one for each dependent var */
+  struct metrics *m;
 
 };
 
 
+/* Create a factor statistics object for N dependent vars,
+   with ID0 and ID1 as the values of the independent variables */
+struct factor_statistics * 
+create_factor_statistics (int n, union value *id0, union value *id1);
 
-void metrics_precalc(struct metrics *fs);
 
-void metrics_calc(struct metrics *fs, const union value *f, double weight);
+void factor_statistics_free(struct factor_statistics *f);
 
-void metrics_postcalc(struct metrics *fs);
 
+int 
+factor_statistics_compare(const struct factor_statistics *f0,
+                         const struct factor_statistics *f1, void *aux);
 
+                             
 
+unsigned int 
+factor_statistics_hash(const struct factor_statistics *f, void *aux);
 
-/* These functions are necessary for creating hashes */
 
-int compare_indep_values(const struct factor_statistics *f1, 
-                    const struct factor_statistics *f2, 
-                    int width);
 
-unsigned hash_indep_value(const struct factor_statistics *f, int width) ;
 
-void  free_factor_stats(struct factor_statistics *f, int width );
 
 
 #endif
index 457be8ace071b856fff5cd17d5fc5dd94d14bd16..5b35a9f1dbeb02c52f721a1dca431f625ffce373 100644 (file)
@@ -185,7 +185,7 @@ done_glob(void)
 {
   dict_destroy(default_dict);
   free(logfn);
-  done_settings();
+  /* done_settings(); */
   ds_destroy (&tokstr);
 }
 
index 588b311b45774c37cc52d5476b4a79b24b589b17..a73256589ab69d55fce8252be7fa16074f0834a0 100644 (file)
@@ -175,6 +175,8 @@ hsh_clear (struct hsh_table *h)
 
   for (i = 0; i < h->size; i++)
     h->entries[i] = NULL;
+
+  h->used = 0;
 }
 
 /* Destroys table H and all its contents. */
@@ -259,6 +261,9 @@ comparison_helper (const void *a_, const void *b_, void *h_)
   void *const *b = b_;
   struct hsh_table *h = h_;
 
+  assert(a);
+  assert(b);
+
   return h->compare (*a, *b, h->aux);
 }
 
index 8b1d32869e2622398111f5bc2afcb9024ddbec80..16e649687cb577db05891b75f9ee913251d92f6e 100644 (file)
@@ -500,7 +500,13 @@ value_to_string(const union value *val, const struct variable *var)
 {
   static char buf[100];
   char *s;
-  const struct val_labs *val_labs = var->val_labs;
+  const struct val_labs *val_labs ;
+  
+  if ( !val || ! var ) 
+    return 0;
+
+  val_labs = var->val_labs;
+
   
   s = val_labs_find (val_labs, *val);
 
index 4c9781bf5a8c1aab7d028eaa326ea66370261ccf..7d78027ed78011bb54bb3de332ba512502ac2449 100644 (file)
@@ -74,5 +74,8 @@ cmd_variable_labels (void)
 const char *
 var_to_string(const struct variable *var)
 {
+  if ( !var ) 
+    return 0;
+
   return ( var->label ? var->label : var->name);
 }
index 9bc159a38cf491fb46f97e6918b74c0ef8d3ecb1..7dd403f855cd14ada21eefdba847f4a93a75d01a 100644 (file)
@@ -1,3 +1,5 @@
+       * command/examine.sh Added
+       
 Mon Nov 15 23:52:55 2004  Ben Pfaff  <blp@gnu.org>
 
        * bugs/random.sh: Update expected random values to reflect the GSL
index 2736d5f2582da8c18b06d913c11f121e1ca98dd4..9786d328bc82faadc132e4ce04cb5eed99f2cd53 100644 (file)
@@ -9,6 +9,7 @@ TESTS = \
        command/count.sh \
        command/data-list.sh \
        command/erase.sh \
+       command/examine.sh \
        command/file-label.sh \
        command/filter.sh \
        command/flip.sh \
diff --git a/tests/command/examine.sh b/tests/command/examine.sh
new file mode 100755 (executable)
index 0000000..5306496
--- /dev/null
@@ -0,0 +1,255 @@
+#!/bin/sh
+
+# This program tests  the EXAMINE command.
+
+TEMPDIR=/tmp/pspp-tst-$$
+
+here=`pwd`;
+
+# ensure that top_srcdir is absolute
+cd $top_srcdir; top_srcdir=`pwd`
+
+export STAT_CONFIG_PATH=$top_srcdir/config
+
+
+cleanup()
+{
+     rm -rf $TEMPDIR
+}
+
+
+fail()
+{
+    echo $activity
+    echo FAILED
+    cleanup;
+    exit 1;
+}
+
+
+no_result()
+{
+    echo $activity
+    echo NO RESULT;
+    cleanup;
+    exit 2;
+}
+
+pass()
+{
+    cleanup;
+    exit 0;
+}
+
+mkdir -p $TEMPDIR
+
+cd $TEMPDIR
+
+activity="create program"
+cat > $TEMPDIR/out.stat <<EOF
+DATA LIST LIST /quality * w * brand * .
+BEGIN DATA
+3  1  1
+2  2  1
+1  2  1
+1  1  1
+4  1  1
+4  1  1
+5  1  2
+2  1  2
+4  4  2
+2  1  2
+3  1  2
+7  1  3
+4  2  3
+5  3  3
+3  1  3
+6  1  3
+END DATA
+
+WEIGHT BY w.
+
+VARIABLE LABELS brand   'Manufacturer'.
+VARIABLE LABELS quality 'Breaking Strain'.
+
+VALUE LABELS /brand 1 'Aspeger' 2 'Bloggs' 3 'Charlies'.
+
+LIST /FORMAT=NUMBERED.
+
+EXAMINE
+       quality BY brand
+       /STATISTICS descriptives extreme(3)
+       .
+EOF
+if [ $? -ne 0 ] ; then no_result ; fi
+
+
+activity="run program"
+$SUPERVISOR $here/../src/pspp -o raw-ascii $TEMPDIR/out.stat
+if [ $? -ne 0 ] ; then no_result ; fi
+
+
+# NOTE: In the following expected output, only the extreme values have been
+# checked. The descriptives have been blindly pasted.
+activity="compare results"
+diff $TEMPDIR/pspp.list - << EOF
+1.1 DATA LIST.  Reading free-form data from the command file.
++--------+------+
+|Variable|Format|
+#========#======#
+|QUALITY |F8.0  |
+|W       |F8.0  |
+|BRAND   |F8.0  |
++--------+------+
+
+Case#  QUALITY        W    BRAND
+----- -------- -------- --------
+    1     3.00     1.00     1.00 
+    2     2.00     2.00     1.00 
+    3     1.00     2.00     1.00 
+    4     1.00     1.00     1.00 
+    5     4.00     1.00     1.00 
+    6     4.00     1.00     1.00 
+    7     5.00     1.00     2.00 
+    8     2.00     1.00     2.00 
+    9     4.00     4.00     2.00 
+   10     2.00     1.00     2.00 
+   11     3.00     1.00     2.00 
+   12     7.00     1.00     3.00 
+   13     4.00     2.00     3.00 
+   14     5.00     3.00     3.00 
+   15     3.00     1.00     3.00 
+   16     6.00     1.00     3.00 
+
+2.1 EXAMINE.  Case Processing Summary
+#===============#=============================#
+#               #            Cases            #
+#               #---------+---------+---------#
+#               #  Valid  | Missing |  Total  #
+#---------------#-+-------+-+-------+-+-------#
+#               #N|Percent|N|Percent|N|Percent#
+#===============#=#=======#=#=======#=#=======#
+#Breaking Strain# |       | |       | |       #
+#===============#=#=======#=#=======#=#=======#
+
+2.2 EXAMINE.  Descriptives
+#===============#===========================================#=========#==========#
+#               |                                           #Statistic|Std. Error#
+#===============#===========================================#=========#==========#
+#Breaking Strain|Mean                                       #   3.54  |   .324   #
+#               |95% Confidence Interval for MeanLower Bound#  3.562  |          #
+#               |                                Upper Bound#  3.521  |          #
+#               |5% Trimmed Mean                            #   3.50  |          #
+#               |Median                                     #         |          #
+#               |Variance                                   #  2.520  |          #
+#               |Std. Deviation                             #  1.587  |          #
+#               |Minimum                                    #  1.000  |          #
+#               |Maximum                                    #  7.000  |          #
+#               |Range                                      #  6.000  |          #
+#               |Interquartile Range                        #         |          #
+#               |Skewness                                   #         |          #
+#               |Kurtosis                                   #         |          #
+#===============#===========================================#=========#==========#
+
+2.3 EXAMINE.  Extreme Values
+#=======================#===========#=====#
+#                       #Case Number|Value#
+#=======================#===========#=====#
+#Breaking StrainHighest1#         12| 7.00#
+#                      2#         16| 6.00#
+#                      3#         14| 5.00#
+#                Lowest1#          4| 1.00#
+#                      2#          3| 1.00#
+#                      3#          3| 1.00#
+#=======================#===========#=====#
+
+2.4 EXAMINE.  Case Processing Summary
+#===============#============#=============================#
+#               |            #            Cases            #
+#               |            #---------+---------+---------#
+#               |            #  Valid  | Missing |  Total  #
+#---------------+------------#-+-------+-+-------+-+-------#
+#               |Manufacturer#N|Percent|N|Percent|N|Percent#
+#===============#============#=#=======#=#=======#=#=======#
+#Breaking Strain|     Aspeger# |       | |       | |       #
+#               |      Bloggs# |       | |       | |       #
+#               |    Charlies# |       | |       | |       #
+#===============#============#=#=======#=#=======#=#=======#
+
+2.5 EXAMINE.  Descriptives
+#===========================#===========================================#=========#==========#
+#               Manufacturer|                                           #Statistic|Std. Error#
+#===========================#===========================================#=========#==========#
+#Breaking Strain     Aspeger|Mean                                       #   2.25  |   .453   #
+#                           |95% Confidence Interval for MeanLower Bound#  2.279  |          #
+#                           |                                Upper Bound#  2.221  |          #
+#                           |5% Trimmed Mean                            #   2.22  |          #
+#                           |Median                                     #         |          #
+#                           |Variance                                   #  1.643  |          #
+#                           |Std. Deviation                             #  1.282  |          #
+#                           |Minimum                                    #  1.000  |          #
+#                           |Maximum                                    #  4.000  |          #
+#                           |Range                                      #  3.000  |          #
+#                           |Interquartile Range                        #         |          #
+#                           |Skewness                                   #         |          #
+#                           |Kurtosis                                   #         |          #
+#               ------------+-------------------------------------------#---------+----------#
+#                     Bloggs|Mean                                       #   3.50  |   .378   #
+#                           |95% Confidence Interval for MeanLower Bound#  3.525  |          #
+#                           |                                Upper Bound#  3.475  |          #
+#                           |5% Trimmed Mean                            #   3.50  |          #
+#                           |Median                                     #         |          #
+#                           |Variance                                   #  1.143  |          #
+#                           |Std. Deviation                             #  1.069  |          #
+#                           |Minimum                                    #  2.000  |          #
+#                           |Maximum                                    #  5.000  |          #
+#                           |Range                                      #  3.000  |          #
+#                           |Interquartile Range                        #         |          #
+#                           |Skewness                                   #         |          #
+#                           |Kurtosis                                   #         |          #
+#               ------------+-------------------------------------------#---------+----------#
+#                   Charlies|Mean                                       #   4.88  |   .441   #
+#                           |95% Confidence Interval for MeanLower Bound#  4.904  |          #
+#                           |                                Upper Bound#  4.846  |          #
+#                           |5% Trimmed Mean                            #   4.86  |          #
+#                           |Median                                     #         |          #
+#                           |Variance                                   #  1.554  |          #
+#                           |Std. Deviation                             #  1.246  |          #
+#                           |Minimum                                    #  3.000  |          #
+#                           |Maximum                                    #  7.000  |          #
+#                           |Range                                      #  4.000  |          #
+#                           |Interquartile Range                        #         |          #
+#                           |Skewness                                   #         |          #
+#                           |Kurtosis                                   #         |          #
+#===========================#===========================================#=========#==========#
+
+2.6 EXAMINE.  Extreme Values
+#===================================#===========#=====#
+#               Manufacturer        #Case Number|Value#
+#===================================#===========#=====#
+#Breaking Strain     AspegerHighest1#          6| 4.00#
+#                                  2#          5| 4.00#
+#                                  3#          1| 3.00#
+#                            Lowest1#          4| 1.00#
+#                                  2#          3| 1.00#
+#                                  3#          3| 1.00#
+#               --------------------#-----------+-----#
+#                     BloggsHighest1#          7| 5.00#
+#                                  2#          9| 4.00#
+#                                  3#          9| 4.00#
+#                            Lowest1#         10| 2.00#
+#                                  2#          8| 2.00#
+#                                  3#         11| 3.00#
+#               --------------------#-----------+-----#
+#                   CharliesHighest1#         12| 7.00#
+#                                  2#         16| 6.00#
+#                                  3#         14| 5.00#
+#                            Lowest1#         15| 3.00#
+#                                  2#         13| 4.00#
+#                                  3#         13| 4.00#
+#===================================#===========#=====#
+
+EOF
+if [ $? -ne 0 ] ; then fail ; fi
+
+pass