#include "hash.h"
#include "casefile.h"
#include "factor_stats.h"
+#include "moments.h"
+#include "percentiles.h"
+
/* (headers) */
#include "chart.h"
rep:report/!noreport,
incl:include/!exclude;
+compare=cmp:variables/!groups;
+ +percentiles=custom;
+plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
+cinterval=double;
+statistics[st_]=descriptives,:extreme(*d:n),all,none.
/* Hash table of factor stats indexed by 2 values */
struct hsh_table *fstats;
- /* The hash table after it's been crunched */
+ /* The hash table after it has been crunched */
struct factor_statistics **fs;
struct factor *next;
static struct metrics *totals=0;
-void
-print_factors(void)
-{
- struct factor *f = factors;
-
- while (f)
- {
- struct factor_statistics **fs = f->fs;
-
- printf("Factor: %s BY %s\n",
- var_to_string(f->indep_var[0]),
- var_to_string(f->indep_var[1]) );
-
-
- printf("Contains %d entries\n", hsh_count(f->fstats));
-
-
- while (*fs)
- {
- printf("Factor %g; %g\n", (*fs)->id[0].f, (*fs)->id[1].f);
-
- /*
- printf("Factor %s; %s\n",
- value_to_string(&(*fs)->id[0], f->indep_var[0]),
- value_to_string(&(*fs)->id[1], f->indep_var[1]));
- */
-
-
- printf("Sum is %g; ",(*fs)->m[0].sum);
- printf("N is %g; ",(*fs)->m[0].n);
- printf("Mean is %g\n",(*fs)->m[0].mean);
-
- fs++ ;
- }
-
- f = f->next;
- }
-
-
-}
-
-
/* Parse the clause specifying the factors */
static int examine_parse_independent_vars(struct cmd_examine *cmd);
int n_dep_var,
struct factor *factor);
+static void show_percentiles(struct variable **dependent_var,
+ int n_dep_var,
+ struct factor *factor);
+
-void np_plot(const struct metrics *m, const char *factorname);
+void np_plot(const struct metrics *m, const char *factorname);
static is_missing_func value_is_missing;
+/* PERCENTILES */
+
+static subc_list_double percentile_list;
+
+static enum pc_alg percentile_algorithm;
+
+static short sbc_percentile;
+
+
int
cmd_examine(void)
{
+ subc_list_double_create(&percentile_list);
+ percentile_algorithm = PC_HAVERAGE;
+
if ( !parse_examine(&cmd) )
return CMD_FAILURE;
if ( ! cmd.sbc_cinterval)
cmd.n_cinterval[0] = 95.0;
+
+ /* If descriptives have been requested, make sure the
+ quartiles are calculated */
+ if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
+ {
+ subc_list_double_push(&percentile_list, 25);
+ subc_list_double_push(&percentile_list, 50);
+ subc_list_double_push(&percentile_list, 75);
+ }
+
multipass_procedure_with_splits (run_examine, &cmd);
+ if ( totals )
+ free(totals);
+
+ subc_list_double_destroy(&percentile_list);
+
return CMD_SUCCESS;
};
show_descriptives(dependent_vars, n_dependent_vars, 0);
}
+ if ( sbc_percentile )
+ show_percentiles(dependent_vars, n_dependent_vars, 0);
if ( cmd.sbc_plot)
{
+ int v;
if ( cmd.a_plot[XMN_PLT_NPPLOT] )
{
- int v;
+ for ( v = 0 ; v < n_dependent_vars; ++v )
+ np_plot(&totals[v], var_to_string(dependent_vars[v]));
+ }
+ if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
+ {
for ( v = 0 ; v < n_dependent_vars; ++v )
- np_plot(&totals[v], var_to_string(dependent_vars[v]));
+ {
+ struct normal_curve normal;
+
+ normal.N = totals[v].n;
+ normal.mean = totals[v].mean;
+ normal.stddev = totals[v].stddev;
+
+ histogram_plot(totals[v].histogram,
+ var_to_string(dependent_vars[v]),
+ &normal, 0);
+ }
}
- }
+ }
}
show_descriptives(dependent_vars, n_dependent_vars, fctr);
}
+ if ( sbc_percentile )
+ show_percentiles(dependent_vars, n_dependent_vars, fctr);
+
+
if ( cmd.sbc_plot)
{
- if ( cmd.a_plot[XMN_PLT_NPPLOT] )
+ int v;
+
+ struct factor_statistics **fs = fctr->fs ;
+
+ for ( v = 0 ; v < n_dependent_vars; ++v )
{
- int v;
- for ( v = 0 ; v < n_dependent_vars; ++ v)
+
+ for ( fs = fctr->fs ; *fs ; ++fs )
{
-
- struct factor_statistics **fs = fctr->fs ;
- for ( fs = fctr->fs ; *fs ; ++fs )
- {
- char buf1[100];
- char buf2[100];
- sprintf(buf1, "%s (",
- var_to_string(dependent_vars[v]));
+ char buf1[100];
+ char buf2[100];
+ sprintf(buf1, "%s (",
+ var_to_string(dependent_vars[v]));
+
+ snprintf(buf2, 100, "%s = %s",
+ var_to_string(fctr->indep_var[0]),
+ value_to_string(&(*fs)->id[0],fctr->indep_var[0]));
- sprintf(buf2, "%s = %s",
- var_to_string(fctr->indep_var[0]),
- value_to_string(&(*fs)->id[0],fctr->indep_var[0]));
+ strcat(buf1, buf2);
+ if ( fctr->indep_var[1] )
+ {
+ sprintf(buf2, "; %s = %s)",
+ var_to_string(fctr->indep_var[1]),
+ value_to_string(&(*fs)->id[1],
+ fctr->indep_var[1]));
strcat(buf1, buf2);
+ }
+ else
+ {
+ strcat(buf1, ")");
+ }
-
- if ( fctr->indep_var[1] )
- {
- sprintf(buf2, "; %s = %s)",
- var_to_string(fctr->indep_var[1]),
- value_to_string(&(*fs)->id[1],
- fctr->indep_var[1]));
- strcat(buf1, buf2);
- }
- else
- {
- strcat(buf1, ")");
- }
-
- np_plot(&(*fs)->m[v],buf1);
+ if ( cmd.a_plot[XMN_PLT_NPPLOT] )
+ np_plot(&(*fs)->m[v],buf1);
+
+
+ if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
+ {
+ struct normal_curve normal;
+ normal.N = (*fs)->m[v].n;
+ normal.mean = (*fs)->m[v].mean;
+ normal.stddev = (*fs)->m[v].stddev;
+
+ histogram_plot((*fs)->m[v].histogram,
+ buf1, &normal, 0);
}
- }
+ } /* for ( fs .... */
+
+ } /* for ( v = 0 ..... */
- }
}
fctr = fctr->next;
}
+static struct hsh_table *
+list_to_ptile_hash(const subc_list_double *l)
+{
+ int i;
+
+ struct hsh_table *h ;
+
+ h = hsh_create(subc_list_double_count(l),
+ (hsh_compare_func *) ptile_compare,
+ (hsh_hash_func *) ptile_hash,
+ (hsh_free_func *) free,
+ 0);
+
+
+ for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
+ {
+ struct percentile *p = xmalloc (sizeof (struct percentile));
+
+ p->p = subc_list_double_at(l,i);
+
+ hsh_insert(h, p);
+
+ }
+
+ return h;
+
+}
+
+/* Parse the PERCENTILES subcommand */
+static int
+xmn_custom_percentiles(struct cmd_examine *p UNUSED)
+{
+ sbc_percentile = 1;
+
+ lex_match('=');
+
+ lex_match('(');
+
+ while ( lex_double_p() )
+ {
+ subc_list_double_push(&percentile_list,lex_double());
+
+ lex_get();
+
+ lex_match(',') ;
+ }
+ lex_match(')');
+
+ lex_match('=');
+
+ if ( lex_match_id("HAVERAGE"))
+ percentile_algorithm = PC_HAVERAGE;
+
+ else if ( lex_match_id("WAVERAGE"))
+ percentile_algorithm = PC_WAVERAGE;
+
+ else if ( lex_match_id("ROUND"))
+ percentile_algorithm = PC_ROUND;
+
+ else if ( lex_match_id("EMPIRICAL"))
+ percentile_algorithm = PC_EMPIRICAL;
+
+ else if ( lex_match_id("AEMPIRICAL"))
+ percentile_algorithm = PC_AEMPIRICAL;
+
+ else if ( lex_match_id("NONE"))
+ percentile_algorithm = PC_NONE;
+
+
+ if ( 0 == subc_list_double_count(&percentile_list))
+ {
+ subc_list_double_push(&percentile_list, 5);
+ subc_list_double_push(&percentile_list, 10);
+ subc_list_double_push(&percentile_list, 25);
+ subc_list_double_push(&percentile_list, 50);
+ subc_list_double_push(&percentile_list, 75);
+ subc_list_double_push(&percentile_list, 90);
+ subc_list_double_push(&percentile_list, 95);
+ }
+
+ return 1;
+}
/* TOTAL and NOTOTAL are simple, mutually exclusive flags */
static int
+void populate_percentiles(struct tab_table *tbl, int col, int row,
+ const struct metrics *m);
+
void populate_descriptives(struct tab_table *t, int col, int row,
const struct metrics *fs);
fs != 0 ;
fs = hsh_next(fctr->fstats, &hi))
{
+
+ fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
+ fs->m[v].ptile_alg = percentile_algorithm;
metrics_postcalc(&fs->m[v]);
}
fctr = fctr->next;
}
+
+ totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
+ totals[v].ptile_alg = percentile_algorithm;
metrics_postcalc(&totals[v]);
}
fctr = fctr->next;
}
- /*
- print_factors();
- */
-
output_examine();
+ for ( v = 0 ; v < n_dependent_vars ; ++v )
+ hsh_destroy(totals[v].ordered_data);
+
}
int extremity;
int idx=0;
- const int n_data = hsh_count(m->ordered_data);
tab_text(t, col, row,
TAB_RIGHT | TAT_TITLE ,
/* Lowest */
- for (idx = 0, extremity = 0; extremity < n && idx < n_data ; ++idx )
+ for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
{
int j;
- const struct weighted_value *wv = &m->wv[idx];
+ const struct weighted_value *wv = m->wvp[idx];
struct case_node *cn = wv->case_nos;
/* Highest */
- for (idx = n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
+ for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
{
int j;
- const struct weighted_value *wv = &m->wv[idx];
+ const struct weighted_value *wv = m->wvp[idx];
struct case_node *cn = wv->case_nos;
for (j = 0 ; j < wv->w ; ++j )
+
+
+
+
+
/* Fill in the descriptives data */
void
populate_descriptives(struct tab_table *tbl, int col, int row,
tab_float (tbl, col + 3,
row,
TAB_CENTER,
- m->stderr,
+ m->se_mean,
8,3);
tab_float (tbl, col + 2,
row + 1,
TAB_CENTER,
- m->mean - t * m->stderr,
+ m->mean - t * m->se_mean,
8,3);
tab_text (tbl, col + 1,
tab_float (tbl, col + 2,
row + 2,
TAB_CENTER,
- m->mean + t * m->stderr,
+ m->mean + t * m->se_mean,
8,3);
tab_text (tbl, col,
TAB_LEFT | TAT_TITLE,
_("Median"));
+ {
+ struct percentile *p;
+ double d = 50;
+
+ p = hsh_find(m->ptile_hash, &d);
+
+ assert(p);
+
+ tab_float (tbl, col + 2,
+ row + 4,
+ TAB_CENTER,
+ p->v,
+ 8, 2);
+ }
+
tab_text (tbl, col,
row + 5,
TAB_LEFT | TAT_TITLE,
TAB_LEFT | TAT_TITLE,
_("Interquartile Range"));
+ {
+ struct percentile *p1;
+ struct percentile *p2;
+
+ double d = 75;
+ p1 = hsh_find(m->ptile_hash, &d);
+
+ d = 25;
+ p2 = hsh_find(m->ptile_hash, &d);
+
+ assert(p1);
+ assert(p2);
+
+ tab_float (tbl, col + 2,
+ row + 10,
+ TAB_CENTER,
+ p1->v - p2->v,
+ 8, 2);
+ }
+
+
+
tab_text (tbl, col,
row + 11,
TAB_LEFT | TAT_TITLE,
_("Skewness"));
+
+ tab_float (tbl, col + 2,
+ row + 11,
+ TAB_CENTER,
+ m->skewness,
+ 8,3);
+
+ /* stderr of skewness */
+ tab_float (tbl, col + 3,
+ row + 11,
+ TAB_CENTER,
+ calc_seskew(m->n),
+ 8,3);
+
+
tab_text (tbl, col,
row + 12,
TAB_LEFT | TAT_TITLE,
_("Kurtosis"));
-}
+ tab_float (tbl, col + 2,
+ row + 12,
+ TAB_CENTER,
+ m->kurtosis,
+ 8,3);
+
+ /* stderr of kurtosis */
+ tab_float (tbl, col + 3,
+ row + 12,
+ TAB_CENTER,
+ calc_sekurt(m->n),
+ 8,3);
+
+}
/* Plot the normal and detrended normal plots for m
/* Detrended Normal Plot */
struct chart dnp_chart;
- const struct weighted_value *wv = m->wv;
- const int n_data = hsh_count(m->ordered_data) ;
-
/* The slope and intercept of the ideal normal probability line */
const double slope = 1.0 / m->stddev;
const double intercept = - m->mean / m->stddev;
/* Cowardly refuse to plot an empty data set */
- if ( n_data == 0 )
+ if ( m->n_data == 0 )
return ;
chart_initialise(&np_chart);
chart_write_xlabel(&dnp_chart, _("Observed Value"));
chart_write_ylabel(&dnp_chart, _("Dev from Normal"));
- yfirst = gsl_cdf_ugaussian_Pinv (wv[0].rank / ( m->n + 1));
- ylast = gsl_cdf_ugaussian_Pinv (wv[n_data-1].rank / ( m->n + 1));
+ yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
+ ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
+
{
/* Need to make sure that both the scatter plot and the ideal fit into the
double x_upper = max(m->max, (ylast - intercept) / slope) ;
double slack = (x_upper - x_lower) * 0.05 ;
- chart_write_xscale(&np_chart, x_lower - slack, x_upper + slack,
- chart_rounded_tick((m->max - m->min) / 5.0));
+ chart_write_xscale(&np_chart, x_lower - slack, x_upper + slack, 5);
-
- chart_write_xscale(&dnp_chart, m->min, m->max,
- chart_rounded_tick((m->max - m->min) / 5.0));
+ chart_write_xscale(&dnp_chart, m->min, m->max, 5);
}
- chart_write_yscale(&np_chart, yfirst, ylast,
- chart_rounded_tick((ylast - yfirst)/5.0) );
+ chart_write_yscale(&np_chart, yfirst, ylast, 5);
{
/* We have to cache the detrended data, beacause we need to
find its limits before we can plot it */
double *d_data;
- d_data = xmalloc (n_data * sizeof(double));
+ d_data = xmalloc (m->n_data * sizeof(double));
double d_max = -DBL_MAX;
double d_min = DBL_MAX;
- for ( i = 0 ; i < n_data; ++i )
+ for ( i = 0 ; i < m->n_data; ++i )
{
- const double ns = gsl_cdf_ugaussian_Pinv (wv[i].rank / ( m->n + 1));
+ const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
- chart_datum(&np_chart, 0, wv[i].v.f, ns);
+ chart_datum(&np_chart, 0, m->wvp[i]->v.f, ns);
- d_data[i] = (wv[i].v.f - m->mean) / m->stddev - ns;
+ d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
if ( d_data[i] < d_min ) d_min = d_data[i];
if ( d_data[i] > d_max ) d_max = d_data[i];
}
+ chart_write_yscale(&dnp_chart, d_min, d_max, 5);
- chart_write_yscale(&dnp_chart, d_min, d_max,
- chart_rounded_tick((d_max - d_min) / 5.0));
-
- for ( i = 0 ; i < n_data; ++i )
- chart_datum(&dnp_chart, 0, wv[i].v.f, d_data[i]);
+ for ( i = 0 ; i < m->n_data; ++i )
+ chart_datum(&dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
free(d_data);
}
chart_finalise(&dnp_chart);
}
+
+
+
+
+/* Show the percentiles */
+void
+show_percentiles(struct variable **dependent_var,
+ int n_dep_var,
+ struct factor *fctr)
+{
+ struct tab_table *tbl;
+ int i;
+
+ int n_cols, n_rows;
+ int n_factors;
+
+ struct hsh_table *ptiles ;
+
+ int n_heading_columns;
+ const int n_heading_rows = 2;
+ const int n_stat_rows = 2;
+
+ int n_ptiles ;
+
+ if ( fctr )
+ {
+ struct factor_statistics **fs = fctr->fs ;
+ n_heading_columns = 3;
+ n_factors = hsh_count(fctr->fstats);
+
+ ptiles = (*fs)->m[0].ptile_hash;
+
+ if ( fctr->indep_var[1] )
+ n_heading_columns = 4;
+ }
+ else
+ {
+ n_factors = 1;
+ n_heading_columns = 2;
+
+ ptiles = totals[0].ptile_hash;
+ }
+
+ n_ptiles = hsh_count(ptiles);
+
+ n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
+
+ n_cols = n_heading_columns + n_ptiles ;
+
+ tbl = tab_create (n_cols, n_rows, 0);
+
+ tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
+
+ tab_dim (tbl, tab_natural_dimensions);
+
+ /* Outline the box and have no internal lines*/
+ tab_box (tbl,
+ TAL_2, TAL_2,
+ -1, -1,
+ 0, 0,
+ n_cols - 1, n_rows - 1);
+
+ tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
+
+ tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
+
+
+ tab_title (tbl, 0, _("Percentiles"));
+
+
+ tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
+
+
+ tab_box (tbl,
+ -1, -1,
+ -1, TAL_1,
+ 0, n_heading_rows,
+ n_heading_columns - 1, n_rows - 1);
+
+
+ tab_box (tbl,
+ -1, -1,
+ -1, TAL_1,
+ n_heading_columns, n_heading_rows - 1,
+ n_cols - 1, n_rows - 1);
+
+ tab_joint_text(tbl, n_heading_columns + 1, 0,
+ n_cols - 1 , 0,
+ TAB_CENTER | TAT_TITLE ,
+ _("Percentiles"));
+
+
+ {
+ /* Put in the percentile break points as headings */
+
+ struct percentile **p = (struct percentile **) hsh_sort(ptiles);
+
+ i = 0;
+ while ( (*p) )
+ {
+ tab_float(tbl, n_heading_columns + i++ , 1,
+ TAB_CENTER,
+ (*p)->p, 8, 0);
+
+ p++;
+ }
+
+ }
+
+ for ( i = 0 ; i < n_dep_var ; ++i )
+ {
+ const int n_stat_rows = 2;
+ const int row = n_heading_rows + i * n_stat_rows * n_factors ;
+
+ if ( i > 0 )
+ tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
+
+ tab_text (tbl, 0,
+ i * n_stat_rows * n_factors + n_heading_rows,
+ TAB_LEFT | TAT_TITLE,
+ var_to_string(dependent_var[i])
+ );
+
+ if ( fctr )
+ {
+ struct factor_statistics **fs = fctr->fs;
+ int count = 0;
+
+ tab_text (tbl, 1, n_heading_rows - 1,
+ TAB_CENTER | TAT_TITLE,
+ var_to_string(fctr->indep_var[0]));
+
+
+ if ( fctr->indep_var[1])
+ tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
+ var_to_string(fctr->indep_var[1]));
+
+ while( *fs )
+ {
+
+ static union value prev ;
+
+ const int row = n_heading_rows + n_stat_rows *
+ ( ( i * n_factors ) + count );
+
+
+ if ( 0 != compare_values(&prev, &(*fs)->id[0],
+ fctr->indep_var[0]->width))
+ {
+
+ if ( count > 0 )
+ tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
+
+ tab_text (tbl,
+ 1, row,
+ TAB_LEFT | TAT_TITLE,
+ value_to_string(&(*fs)->id[0], fctr->indep_var[0])
+ );
+
+
+ }
+
+ prev = (*fs)->id[0];
+
+ if (fctr->indep_var[1] && count > 0 )
+ tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
+
+ if ( fctr->indep_var[1])
+ tab_text (tbl, 2, row,
+ TAB_LEFT | TAT_TITLE,
+ value_to_string(&(*fs)->id[1], fctr->indep_var[1])
+ );
+
+
+ populate_percentiles(tbl, n_heading_columns - 1,
+ row, &(*fs)->m[i]);
+
+
+ count++ ;
+ fs++;
+ }
+
+
+ }
+ else
+ {
+ populate_percentiles(tbl, n_heading_columns - 1,
+ i * n_stat_rows * n_factors + n_heading_rows,
+ &totals[i]);
+ }
+
+
+ }
+
+
+ tab_submit(tbl);
+
+
+}
+
+
+
+
+void
+populate_percentiles(struct tab_table *tbl, int col, int row,
+ const struct metrics *m)
+{
+ int i;
+
+ struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
+
+ tab_text (tbl,
+ col, row + 1,
+ TAB_LEFT | TAT_TITLE,
+ _("Tukey\'s Hinges")
+ );
+
+ tab_text (tbl,
+ col, row,
+ TAB_LEFT | TAT_TITLE,
+ ptile_alg_desc[m->ptile_alg]
+ );
+
+
+ i = 0;
+ while ( (*p) )
+ {
+ tab_float(tbl, col + i + 1 , row,
+ TAB_CENTER,
+ (*p)->v, 8, 2);
+ if ( (*p)->p == 25 )
+ tab_float(tbl, col + i + 1 , row + 1,
+ TAB_CENTER,
+ m->hinges[0], 8, 2);
+
+ if ( (*p)->p == 50 )
+ tab_float(tbl, col + i + 1 , row + 1,
+ TAB_CENTER,
+ m->hinges[1], 8, 2);
+
+ if ( (*p)->p == 75 )
+ tab_float(tbl, col + i + 1 , row + 1,
+ TAB_CENTER,
+ m->hinges[2], 8, 2);
+
+
+ i++;
+
+ p++;
+ }
+
+
+
+
+}
+