/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
/* FIXME:
- - T values for Spearman's R and Pearson's R are wrong.
- - How to calculate significance of symmetric and directional measures?
- - Asymmetric ASEs and T values for lambda are wrong.
- - ASE of Goodman and Kruskal's tau is not calculated.
- - ASE of symmetric somers' d is wrong.
- - Approx. T of uncertainty coefficient is wrong.
+ - How to calculate significance of some symmetric and directional measures?
+ - How to calculate ASE for symmetric Somers ' d?
+ - How to calculate ASE for Goodman and Kruskal's tau?
+ - How to calculate approx. T of symmetric uncertainty coefficient?
*/
#include "data/value-labels.h"
#include "data/variable.h"
#include "language/command.h"
+#include "language/stats/freq.h"
#include "language/dictionary/split-file.h"
#include "language/lexer/lexer.h"
#include "language/lexer/variable-parser.h"
#include "libpspp/pool.h"
#include "libpspp/str.h"
#include "output/tab.h"
+#include "output/chart-item.h"
+#include "output/charts/barchart.h"
#include "gl/minmax.h"
#include "gl/xalloc.h"
*^tables=custom;
+variables=custom;
missing=miss:!table/include/report;
+ count=roundwhat:asis/case/!cell,
+ roundhow:!round/truncate;
+write[wr_]=none,cells,all;
+format=val:!avalue/dvalue,
indx:!noindex/index,
tabl:!tables/notables,
box:!box/nobox,
pivot:!pivot/nopivot;
+ +barchart=;
+cells[cl_]=count,expected,row,column,total,residual,sresidual,
asresidual,all,none;
+statistics[st_]=chisq,phi,cc,lambda,uc,none,btau,ctau,risk,gamma,d,
/* Number of directional statistics. */
#define N_DIRECTIONAL 13
-/* A single table entry for general mode. */
-struct table_entry
- {
- struct hmap_node node; /* Entry in hash table. */
- double freq; /* Frequency count. */
- union value values[1]; /* Values. */
- };
-
-static size_t
-table_entry_size (size_t n_values)
-{
- return (offsetof (struct table_entry, values)
- + n_values * sizeof (union value));
-}
/* Indexes into the 'vars' member of struct pivot_table and
struct crosstab member. */
/* Data. */
struct hmap data;
- struct table_entry **entries;
+ struct freq **entries;
size_t n_entries;
/* Column values, number of columns. */
enum { INTEGER, GENERAL } mode;
enum mv_class exclude;
bool pivot;
+ bool barchart;
bool bad_warn;
struct fmt_spec weight_format;
unsigned int cells; /* Bit k is 1 if cell k is requested. */
int a_cells[CRS_CL_count]; /* 0...n_cells-1 are the requested cells. */
+ /* Rounding of cells. */
+ bool round_case_weights; /* Round case weights? */
+ bool round_cells; /* If !round_case_weights, round cells? */
+ bool round_down; /* Round down? (otherwise to nearest) */
+
/* STATISTICS. */
unsigned int statistics; /* Bit k is 1 if statistic k is requested. */
static void postcalc (struct crosstabs_proc *);
static void submit (struct pivot_table *, struct tab_table *);
+static double
+round_weight (const struct crosstabs_proc *proc, double weight)
+{
+ return proc->round_down ? floor (weight) : floor (weight + 0.5);
+}
+
/* Parses and executes the CROSSTABS procedure. */
int
cmd_crosstabs (struct lexer *lexer, struct dataset *ds)
}
proc.mode = proc.n_variables ? INTEGER : GENERAL;
-
+ proc.barchart = cmd.sbc_barchart > 0;
proc.descending = cmd.val == CRS_DVALUE;
+ proc.round_case_weights = cmd.sbc_count && cmd.roundwhat == CRS_CASE;
+ proc.round_cells = cmd.sbc_count && cmd.roundwhat == CRS_CELL;
+ proc.round_down = cmd.roundhow == CRS_TRUNCATE;
+
/* CELLS. */
if (!cmd.sbc_cells)
proc.cells = 1u << CRS_CL_COUNT;
{
double weight = dict_get_case_weight (dataset_dict (ds), c,
&proc.bad_warn);
+ if (cmd.roundwhat == CRS_CASE)
+ {
+ weight = round_weight (&proc, weight);
+ if (weight == 0.)
+ continue;
+ }
if (should_tabulate_case (pt, c, proc.exclude))
{
if (proc.mode == GENERAL)
if (!lex_match (lexer, T_BY))
{
if (n_by < 2)
- {
- lex_force_match (lexer, T_BY);
- goto done;
- }
+ goto done;
else
break;
}
vr->var = var;
vr->min = min;
- vr->max = max + 1.;
+ vr->max = max;
vr->count = max - min + 1;
hmap_insert (&proc->var_ranges, &vr->hmap_node,
hash_pointer (var, 0));
if (range != NULL)
{
double num = case_num (c, var);
- if (num < range->min || num > range->max)
+ if (num < range->min || num >= range->max + 1.)
return false;
}
}
tabulate_integer_case (struct pivot_table *pt, const struct ccase *c,
double weight)
{
- struct table_entry *te;
+ struct freq *te;
size_t hash;
int j;
hash = hash_int (case_num (c, pt->vars[j]), hash);
}
- HMAP_FOR_EACH_WITH_HASH (te, struct table_entry, node, hash, &pt->data)
+ HMAP_FOR_EACH_WITH_HASH (te, struct freq, node, hash, &pt->data)
{
for (j = 0; j < pt->n_vars; j++)
if ((int) case_num (c, pt->vars[j]) != (int) te->values[j].f)
goto no_match;
/* Found an existing entry. */
- te->freq += weight;
+ te->count += weight;
return;
no_match: ;
/* No existing entry. Create a new one. */
te = xmalloc (table_entry_size (pt->n_vars));
- te->freq = weight;
+ te->count = weight;
for (j = 0; j < pt->n_vars; j++)
te->values[j].f = (int) case_num (c, pt->vars[j]);
hmap_insert (&pt->data, &te->node, hash);
tabulate_general_case (struct pivot_table *pt, const struct ccase *c,
double weight)
{
- struct table_entry *te;
+ struct freq *te;
size_t hash;
int j;
hash = value_hash (case_data (c, var), var_get_width (var), hash);
}
- HMAP_FOR_EACH_WITH_HASH (te, struct table_entry, node, hash, &pt->data)
+ HMAP_FOR_EACH_WITH_HASH (te, struct freq, node, hash, &pt->data)
{
for (j = 0; j < pt->n_vars; j++)
{
}
/* Found an existing entry. */
- te->freq += weight;
+ te->count += weight;
return;
no_match: ;
/* No existing entry. Create a new one. */
te = xmalloc (table_entry_size (pt->n_vars));
- te->freq = weight;
+ te->count = weight;
for (j = 0; j < pt->n_vars; j++)
{
const struct variable *var = pt->vars[j];
\f
/* Post-data reading calculations. */
-static int compare_table_entry_vars_3way (const struct table_entry *a,
- const struct table_entry *b,
+static int compare_table_entry_vars_3way (const struct freq *a,
+ const struct freq *b,
const struct pivot_table *pt,
int idx0, int idx1);
static int compare_table_entry_3way (const void *ap_, const void *bp_,
static void
postcalc (struct crosstabs_proc *proc)
{
- struct pivot_table *pt;
+
+ /* Round hash table entries, if requested
+
+ If this causes any of the cell counts to fall to zero, delete those
+ cells. */
+ if (proc->round_cells)
+ for (struct pivot_table *pt = proc->pivots;
+ pt < &proc->pivots[proc->n_pivots]; pt++)
+ {
+ struct freq *e, *next;
+ HMAP_FOR_EACH_SAFE (e, next, struct freq, node, &pt->data)
+ {
+ e->count = round_weight (proc, e->count);
+ if (e->count == 0.0)
+ {
+ hmap_delete (&pt->data, &e->node);
+ free (e);
+ }
+ }
+ }
/* Convert hash tables into sorted arrays of entries. */
- for (pt = &proc->pivots[0]; pt < &proc->pivots[proc->n_pivots]; pt++)
+ for (struct pivot_table *pt = proc->pivots;
+ pt < &proc->pivots[proc->n_pivots]; pt++)
{
- struct table_entry *e;
- size_t i;
+ struct freq *e;
pt->n_entries = hmap_count (&pt->data);
pt->entries = xnmalloc (pt->n_entries, sizeof *pt->entries);
- i = 0;
- HMAP_FOR_EACH (e, struct table_entry, node, &pt->data)
+ size_t i = 0;
+ HMAP_FOR_EACH (e, struct freq, node, &pt->data)
pt->entries[i++] = e;
hmap_destroy (&pt->data);
sort (pt->entries, pt->n_entries, sizeof *pt->entries,
proc->descending ? compare_table_entry_3way_inv : compare_table_entry_3way,
pt);
+
}
make_summary_table (proc);
/* Output each pivot table. */
- for (pt = &proc->pivots[0]; pt < &proc->pivots[proc->n_pivots]; pt++)
+ for (struct pivot_table *pt = proc->pivots;
+ pt < &proc->pivots[proc->n_pivots]; pt++)
{
if (proc->pivot || pt->n_vars == 2)
output_pivot_table (proc, pt);
output_pivot_table (proc, &subset);
}
}
+ if (proc->barchart)
+ chart_item_submit
+ (barchart_create (pt->vars, pt->n_vars, _("Count"), false, pt->entries, pt->n_entries));
}
/* Free output and prepare for next split file. */
- for (pt = &proc->pivots[0]; pt < &proc->pivots[proc->n_pivots]; pt++)
+ for (struct pivot_table *pt = proc->pivots;
+ pt < &proc->pivots[proc->n_pivots]; pt++)
{
- size_t i;
-
pt->missing = 0.0;
/* Free the members that were allocated in this function(and the values
lower level (in output_pivot_table), or both allocated and destroyed
at a higher level (in crs_custom_tables and free_proc,
respectively). */
- for (i = 0; i < pt->n_vars; i++)
+ for (size_t i = 0; i < pt->n_vars; i++)
{
int width = var_get_width (pt->vars[i]);
if (value_needs_init (width))
}
}
- for (i = 0; i < pt->n_entries; i++)
+ for (size_t i = 0; i < pt->n_entries; i++)
free (pt->entries[i]);
free (pt->entries);
}
}
static int
-compare_table_entry_var_3way (const struct table_entry *a,
- const struct table_entry *b,
+compare_table_entry_var_3way (const struct freq *a,
+ const struct freq *b,
const struct pivot_table *pt,
int idx)
{
}
static int
-compare_table_entry_vars_3way (const struct table_entry *a,
- const struct table_entry *b,
+compare_table_entry_vars_3way (const struct freq *a,
+ const struct freq *b,
const struct pivot_table *pt,
int idx0, int idx1)
{
return 0;
}
-/* Compare the struct table_entry at *AP to the one at *BP and
+/* Compare the struct freq at *AP to the one at *BP and
return a strcmp()-type result. */
static int
compare_table_entry_3way (const void *ap_, const void *bp_, const void *pt_)
{
- const struct table_entry *const *ap = ap_;
- const struct table_entry *const *bp = bp_;
- const struct table_entry *a = *ap;
- const struct table_entry *b = *bp;
+ const struct freq *const *ap = ap_;
+ const struct freq *const *bp = bp_;
+ const struct freq *a = *ap;
+ const struct freq *b = *bp;
const struct pivot_table *pt = pt_;
int cmp;
return pt->n_vars - 1;
else
{
- const struct table_entry *a = pt->entries[row];
- const struct table_entry *b = pt->entries[row - 1];
+ const struct freq *a = pt->entries[row];
+ const struct freq *b = pt->entries[row - 1];
int col;
for (col = pt->n_vars - 1; col >= 0; col--)
valid = 0.;
for (i = 0; i < pt->n_entries; i++)
- valid += pt->entries[i]->freq;
+ valid += pt->entries[i]->count;
n[0] = valid;
n[1] = pt->missing;
const int row_var_width = var_get_width (x->vars[ROW_VAR]);
int col, row;
double *mp;
- struct table_entry **p;
+ struct freq **p;
mp = x->mat;
col = row = 0;
for (p = x->entries; p < &x->entries[x->n_entries]; p++)
{
- const struct table_entry *te = *p;
+ const struct freq *te = *p;
while (!value_equal (&x->rows[row], &te->values[ROW_VAR], row_var_width))
{
col++;
}
- *mp++ = te->freq;
+ *mp++ = te->count;
if (++col >= x->n_cols)
{
col = 0;
for (row1 = row0 + 1; row1 < pt->n_entries; row1++)
{
- struct table_entry *a = pt->entries[row0];
- struct table_entry *b = pt->entries[row1];
+ struct freq *a = pt->entries[row0];
+ struct freq *b = pt->entries[row1];
if (compare_table_entry_vars_3way (a, b, pt, 2, pt->n_vars) != 0)
break;
}
hmapx_init (&set);
for (i = 0; i < pt->n_entries; i++)
{
- const struct table_entry *te = pt->entries[i];
+ const struct freq *te = pt->entries[i];
const union value *value = &te->values[var_idx];
size_t hash = value_hash (value, width, 0);
static int calc_directional (struct crosstabs_proc *, struct pivot_table *,
double[N_DIRECTIONAL], double[N_DIRECTIONAL],
- double[N_DIRECTIONAL]);
+ double[N_DIRECTIONAL], double[N_DIRECTIONAL]);
/* Display directional measures. */
static void
double direct_v[N_DIRECTIONAL];
double direct_ase[N_DIRECTIONAL];
double direct_t[N_DIRECTIONAL];
+ double sig[N_DIRECTIONAL];
int i;
- if (!calc_directional (proc, pt, direct_v, direct_ase, direct_t))
+ if (!calc_directional (proc, pt, direct_v, direct_ase, direct_t, sig))
return;
tab_offset (direct, pt->n_consts + pt->n_vars - 2, -1);
tab_double (direct, 4, 0, TAB_RIGHT, direct_ase[i], NULL, RC_OTHER);
if (direct_t[i] != SYSMIS)
tab_double (direct, 5, 0, TAB_RIGHT, direct_t[i], NULL, RC_OTHER);
- /*tab_double (direct, 6, 0, TAB_RIGHT, normal_sig (direct_v[i]), NULL, RC_PVALUE);*/
+ tab_double (direct, 6, 0, TAB_RIGHT, sig[i], NULL, RC_PVALUE);
tab_next_row (direct);
}
}
}
-/* Calculate the value of Pearson's r. r is stored into R, ase_1 into
- ASE_1, and ase_0 into ASE_0. The row and column values must be
+/* Calculate the value of Pearson's r. r is stored into R, its T value into
+ T, and standard error into ERROR. The row and column values must be
passed in PT and Y. */
static void
calc_r (struct pivot_table *pt,
- double *PT, double *Y, double *r, double *ase_0, double *ase_1)
+ double *PT, double *Y, double *r, double *t, double *error)
{
double SX, SY, S, T;
double Xbar, Ybar;
SY = sum_Y2c - pow2 (sum_Yc) / pt->total;
T = sqrt (SX * SY);
*r = S / T;
- *ase_0 = sqrt ((sum_X2Y2f - pow2 (sum_XYf) / pt->total) / (sum_X2r * sum_Y2c));
+ *t = *r / sqrt (1 - pow2 (*r)) * sqrt (pt->total - 2);
{
double s, c, y, t;
c = (t - s) - y;
s = t;
}
- *ase_1 = sqrt (s) / (T * T);
+ *error = sqrt (s) / (T * T);
}
}
if (proc->statistics & (1u << CRS_ST_D))
{
somers_d_v[0] = (P - Q) / (.5 * (Dc + Dr));
- somers_d_ase[0] = 2. * btau_var / (Dr + Dc) * sqrt (Dr * Dc);
+ somers_d_ase[0] = SYSMIS;
somers_d_t[0] = (somers_d_v[0]
/ (4 / (Dc + Dr)
* sqrt (ctau_cum - pow2 (P - Q) / pt->total)));
}
calc_r (pt, R, C, &v[6], &t[6], &ase[6]);
- t[6] = v[6] / t[6];
free (R);
free (C);
calc_r (pt, (double *) pt->rows, (double *) pt->cols, &v[7], &t[7], &ase[7]);
- t[7] = v[7] / t[7];
}
/* Cohen's kappa. */
if (proc->statistics & (1u << CRS_ST_KAPPA) && pt->ns_rows == pt->ns_cols)
{
+ double ase_under_h0;
double sum_fii, sum_rici, sum_fiiri_ci, sum_fijri_ci2, sum_riciri_ci;
int i, j;
v[8] = (pt->total * sum_fii - sum_rici) / (pow2 (pt->total) - sum_rici);
- ase[8] = sqrt ((pow2 (pt->total) * sum_rici
- + sum_rici * sum_rici
- - pt->total * sum_riciri_ci)
- / (pt->total * (pow2 (pt->total) - sum_rici) * (pow2 (pt->total) - sum_rici)));
-#if 0
- t[8] = v[8] / sqrt (pt->total * (((sum_fii * (pt->total - sum_fii))
+ ase_under_h0 = sqrt ((pow2 (pt->total) * sum_rici
+ + sum_rici * sum_rici
+ - pt->total * sum_riciri_ci)
+ / (pt->total * (pow2 (pt->total) - sum_rici) * (pow2 (pt->total) - sum_rici)));
+
+ ase[8] = sqrt (pt->total * (((sum_fii * (pt->total - sum_fii))
/ pow2 (pow2 (pt->total) - sum_rici))
+ ((2. * (pt->total - sum_fii)
* (2. * sum_fii * sum_rici
- pt->total * sum_fiiri_ci))
- / cube (pow2 (pt->total) - sum_rici))
+ / pow3 (pow2 (pt->total) - sum_rici))
+ (pow2 (pt->total - sum_fii)
* (pt->total * sum_fijri_ci2 - 4.
* sum_rici * sum_rici)
/ pow4 (pow2 (pt->total) - sum_rici))));
-#else
- t[8] = v[8] / ase[8];
-#endif
+
+ t[8] = v[8] / ase_under_h0;
}
return 1;
static int
calc_directional (struct crosstabs_proc *proc, struct pivot_table *pt,
double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL],
- double t[N_DIRECTIONAL])
+ double t[N_DIRECTIONAL], double sig[N_DIRECTIONAL])
{
{
int i;
for (i = 0; i < N_DIRECTIONAL; i++)
- v[i] = ase[i] = t[i] = SYSMIS;
+ v[i] = ase[i] = t[i] = sig[i] = SYSMIS;
}
/* Lambda. */
/* ASE1 for Y given PT. */
{
- double accum;
-
- for (accum = 0., i = 0; i < pt->n_rows; i++)
- for (j = 0; j < pt->n_cols; j++)
- {
- const int deltaj = j == cm_index;
- accum += (pt->mat[j + i * pt->n_cols]
- * pow2 ((j == fim_index[i])
- - deltaj
- + v[0] * deltaj));
- }
+ double accum;
- ase[2] = sqrt (accum - pt->total * v[0]) / (pt->total - cm);
+ accum = 0.;
+ for (i = 0; i < pt->n_rows; i++)
+ if (cm_index == fim_index[i])
+ accum += fim[i];
+ ase[2] = sqrt ((pt->total - sum_fim) * (sum_fim + cm - 2. * accum)
+ / pow3 (pt->total - cm));
}
/* ASE0 for Y given PT. */
/* ASE1 for PT given Y. */
{
- double accum;
+ double accum;
- for (accum = 0., i = 0; i < pt->n_rows; i++)
- for (j = 0; j < pt->n_cols; j++)
- {
- const int deltaj = i == rm_index;
- accum += (pt->mat[j + i * pt->n_cols]
- * pow2 ((i == fmj_index[j])
- - deltaj
- + v[0] * deltaj));
- }
-
- ase[1] = sqrt (accum - pt->total * v[0]) / (pt->total - rm);
+ accum = 0.;
+ for (j = 0; j < pt->n_cols; j++)
+ if (rm_index == fmj_index[j])
+ accum += fmj[j];
+ ase[1] = sqrt ((pt->total - sum_fmj) * (sum_fmj + rm - 2. * accum)
+ / pow3 (pt->total - rm));
}
/* ASE0 for PT given Y. */
* pow2 (temp0 + (v[0] - 1.) * temp1));
}
ase[0] = sqrt (accum1 - 4. * pt->total * v[0] * v[0]) / (2. * pt->total - rm - cm);
- t[0] = v[0] / (sqrt (accum0 - pow2 ((sum_fim + sum_fmj - cm - rm) / pt->total))
+ t[0] = v[0] / (sqrt (accum0 - pow2 (sum_fim + sum_fmj - cm - rm) / pt->total)
/ (2. * pt->total - rm - cm));
}
+ for (i = 0; i < 3; i++)
+ sig[i] = 2 * gsl_cdf_ugaussian_Q (t[i]);
+
free (fim);
free (fim_index);
free (fmj);
free (fmj_index);
+ /* Tau. */
{
double sum_fij2_ri, sum_fij2_ci;
double sum_ri2, sum_cj2;
v[5] = 2. * ((UX + UY - UXY) / (UX + UY));
ase[5] = (2. / (pt->total * pow2 (UX + UY))) * sqrt (ase1_sym);
- t[5] = v[5] / ((2. / (pt->total * (UX + UY)))
- * sqrt (P - pow2 (UX + UY - UXY) / pt->total));
+ t[5] = SYSMIS;
v[6] = (UX + UY - UXY) / UX;
ase[6] = sqrt (ase1_xy) / (pt->total * UX * UX);
v[8 + i] = somers_d_v[i];
ase[8 + i] = somers_d_ase[i];
t[8 + i] = somers_d_t[i];
+ sig[8 + i] = 2 * gsl_cdf_ugaussian_Q (fabs (somers_d_t[i]));
}
}
}