#include "data/case.h"
#include "data/casegrouper.h"
#include "data/casereader.h"
+#include "data/casewriter.h"
#include "data/dictionary.h"
#include "data/format.h"
#include "data/procedure.h"
#include "data/settings.h"
+#include "data/subcase.h"
#include "data/value-labels.h"
#include "data/variable.h"
#include "language/command.h"
#include "libpspp/str.h"
#include "math/histogram.h"
#include "math/moments.h"
+#include "math/sort.h"
#include "output/chart-item.h"
#include "output/charts/piechart.h"
#include "output/charts/plot-hist.h"
+#include "output/pivot-table.h"
#include "output/tab.h"
#include "gl/minmax.h"
{
double p; /* the %ile to be calculated */
double value; /* the %ile's value */
- double x1; /* The datum value <= the percentile */
- double x2; /* The datum value >= the percentile */
- int flag;
- int flag2; /* Set to 1 if this percentile value has been found */
bool show; /* True to show this percentile in the statistics box. */
};
/* Entire frequency table. */
struct freq_tab
{
- struct hmap data; /* Hash table for accumulating counts. */
+ struct casewriter *sorter;
+ struct casereader *data;
+
struct freq *valid; /* Valid freqs. */
- int n_valid; /* Number of total freqs. */
+ int n_valid; /* Number of valid freqs. */
const struct dictionary *dict; /* Source of entries in the table. */
struct freq *missing; /* Missing freqs. */
static void determine_charts (struct frq_proc *,
const struct cmd_frequencies *);
-static void calc_stats (const struct frq_proc *, const struct var_freqs *,
- double d[FRQ_N_STATS]);
+static void calc_stats (const struct var_freqs *, double d[FRQ_N_STATS]);
+static void calc_percentiles (const struct frq_proc *,
+ const struct var_freqs *);
static void precalc (struct frq_proc *, struct casereader *, struct dataset *);
static void calc (struct frq_proc *, const struct ccase *,
for (i = 0; i < frq->n_vars; i++)
{
struct var_freqs *vf = &frq->vars[i];
- const union value *value = case_data (c, vf->var);
- size_t hash = value_hash (value, vf->width, 0);
- struct freq *f;
+ struct casewriter *writer = vf->tab.sorter;
+ struct ccase *f;
- f = freq_hmap_search (&vf->tab.data, value, vf->width, hash);
- if (f == NULL)
- f = freq_hmap_insert (&vf->tab.data, value, vf->width, hash);
+ f = case_create (casewriter_get_proto (writer));
+ value_copy (case_data_rw_idx (f, 0), case_data (c, vf->var), vf->width);
+ case_data_rw_idx (f, 1)->f = weight;
- f->count += weight;
+ casewriter_write (writer, f);
}
}
+/* Combining callback handed to the distinct-sorting case writer.  Adds the
+   weight stored at value index 1 of B onto the same slot of A, releases
+   B, and returns the resulting case.  A is unshared first so that the
+   update cannot affect other holders of the original case.  AUX is
+   unused. */
+static struct ccase *
+combine_freq_cases (struct ccase *a, struct ccase *b, void *aux UNUSED)
+{
+  struct ccase *merged = case_unshare (a);
+  const double b_weight = case_data_idx (b, 1)->f;
+
+  case_data_rw_idx (merged, 1)->f += b_weight;
+  case_unref (b);
+  return merged;
+}
+
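-/* Prepares each variable that is the target of FREQUENCIES by setting
-   up its hash table. */
+/* Prepares each variable that is the target of FREQUENCIES by setting
+   up the sorting case writer that accumulates its distinct values and
+   their weights. */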
static void
}
for (i = 0; i < frq->n_vars; i++)
- hmap_init (&frq->vars[i].tab.data);
+ {
+ int width = var_get_width (frq->vars[i].var);
+ struct caseproto *proto;
+ struct subcase ordering;
+
+ proto = caseproto_create ();
+ proto = caseproto_add_width (proto, width);
+ proto = caseproto_add_width (proto, 0);
+
+ subcase_init (&ordering, 0, width, SC_ASCEND);
+ frq->vars[i].tab.sorter = sort_distinct_create_writer (
+ &ordering, proto, combine_freq_cases, NULL, NULL);
+
+ caseproto_unref (proto);
+ subcase_destroy (&ordering);
+ }
}
/* Finishes up with the variables after frequencies have been
double d[FRQ_N_STATS];
struct histogram *histogram;
- calc_stats (frq, vf, d);
+ calc_stats (vf, d);
histogram = freq_tab_to_hist (frq, &vf->tab, vf->var);
{
struct freq_tab *ft = &vf->tab;
struct freq_compare_aux aux;
+ struct casereader *reader;
+ struct ccase *c;
size_t count;
struct freq *freqs, *f;
size_t i;
-  /* Extract data from hash table. */
+  /* Extract data from the sorter's output. */
- count = hmap_count (&ft->data);
- freqs = freq_hmap_extract (&ft->data);
+ reader = casewriter_make_reader (ft->sorter);
+ ft->data = casereader_clone (reader);
+ freqs = xnmalloc (casereader_count_cases (reader), sizeof *freqs);
+ for (count = 0; (c = casereader_read (reader)) != NULL; count++)
+ {
+ struct freq *f = &freqs[count];
+ value_clone (&f->value, case_data_idx (c, 0), vf->width);
+ f->count = case_num_idx (c, 1);
+ case_unref (c);
+ }
+ casereader_destroy (reader);
/* Put data into ft. */
ft->valid = freqs;
static void
cleanup_freq_tab (struct var_freqs *vf)
{
+  /* Releases everything VF's frequency table owns: the per-entry values
+     (only when value_needs_init() reports that this width requires it),
+     the data reader, and finally the freq array itself. */
+  if (value_needs_init (vf->width))
+    {
+      const int n_freqs = vf->tab.n_valid + vf->tab.n_missing;
+      int idx = 0;
+
+      /* Valid and missing entries are walked through the one array
+         that starts at tab.valid. */
+      while (idx < n_freqs)
+        {
+          value_destroy (&vf->tab.valid[idx].value, vf->width);
+          idx++;
+        }
+    }
+
+  casereader_destroy (vf->tab.data);
free (vf->tab.valid);
- freq_hmap_destroy (&vf->tab.data, vf->width);
}
/* Parses the VARIABLES subcommand. */
static void
dump_freq_table (const struct var_freqs *vf, const struct variable *wv)
{
- const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0;
- const struct freq_tab *ft = &vf->tab;
- int n_categories;
- struct freq *f;
- struct tab_table *t;
- int r, x;
- double cum_total = 0.0;
- double cum_freq = 0.0;
-
- static const char *headings[] = {
- N_("Value Label"),
- N_("Value"),
- N_("Frequency"),
- N_("Percent"),
- N_("Valid Percent"),
- N_("Cum Percent")
- };
+ /* Displays the frequency table for VF by describing it as a pivot
+ table over VF's accumulated (value, weight) cases: one row per
+ value and four columns (frequency, percent, valid percent,
+ cumulative valid percent). WV is the weighting variable, or
+ NULL if there is none, in which case a synthetic "$Frequency"
+ variable with format F8.0 stands in for it.
+
+ NOTE(review): nothing in this function frees PT, its dictionary,
+ the cloned reader or the cell array; presumably
+ pivot_table_dump() takes ownership -- confirm. */
+ struct variable *value_var, *freq_var;
+ struct pivot_table *pt;
- n_categories = ft->n_valid + ft->n_missing;
- t = tab_create (6, n_categories + 2);
- tab_headers (t, 0, 0, 1, 0);
+ pt = xmalloc (sizeof *pt);
- for (x = 0; x < 6; x++)
- tab_text (t, x, 0, TAB_CENTER | TAT_TITLE, gettext (headings[x]));
+ pt->data = casereader_clone (vf->tab.data);
- r = 1;
- for (f = ft->valid; f < ft->missing; f++)
+ pt->dict = dict_create ();
+ value_var = dict_clone_var_assert (pt->dict, vf->var);
+ if (wv != NULL)
+ freq_var = dict_clone_var_assert (pt->dict, wv);
+ else
{
- const char *label;
- double percent, valid_percent;
-
- cum_freq += f->count;
-
- percent = f->count / ft->total_cases * 100.0;
- valid_percent = f->count / ft->valid_cases * 100.0;
- cum_total += valid_percent;
-
- label = var_lookup_value_label (vf->var, &f->value);
- if (label != NULL)
- tab_text (t, 0, r, TAB_LEFT, label);
-
- tab_value (t, 1, r, TAB_NONE, &f->value, ft->dict, &vf->print);
- tab_double (t, 2, r, TAB_NONE, f->count, wfmt);
- tab_double (t, 3, r, TAB_NONE, percent, NULL);
- tab_double (t, 4, r, TAB_NONE, valid_percent, NULL);
- tab_double (t, 5, r, TAB_NONE, cum_total, NULL);
- r++;
+ freq_var = dict_create_var (pt->dict, "$Frequency", 0);
+ var_set_both_formats (freq_var, &F_8_0);
+ var_set_label (freq_var, _("Frequency"));
}
- for (; f < &ft->valid[n_categories]; f++)
- {
- const char *label;
-
- cum_freq += f->count;
- label = var_lookup_value_label (vf->var, &f->value);
- if (label != NULL)
- tab_text (t, 0, r, TAB_LEFT, label);
+ subcase_init_empty (&pt->split);
+ subcase_init_empty (&pt->dimensions[TABLE_HORZ]);
+ subcase_init_empty (&pt->dimensions[TABLE_VERT]);
+ subcase_add_var (&pt->dimensions[TABLE_VERT], value_var, SC_ASCEND);
+
+ pt->pane.n[TABLE_HORZ] = 4;
+ pt->pane.n[TABLE_VERT] = 1;
+ pt->pane.cells = xmalloc (sizeof *pt->pane.cells);
+ pt->pane.cells[0] = xmalloc (4 * sizeof **pt->pane.cells);
+
+ /* Frequency. */
+ pivot_cell_init (
+ &pt->pane.cells[0][0],
+ pivot_value_create (freq_var, PIVOT_SUM, 0, 0, PIVOT_INCLUDE_ALL));
+
+ /* Percent. The label is passed through _() so that it is
+ translated like the "Frequency" label above. */
+ pivot_cell_init (
+ &pt->pane.cells[0][1],
+ pivot_value_create (freq_var, PIVOT_SUM, 0, 0, PIVOT_INCLUDE_ALL));
+ pt->pane.cells[0][1].cmp = PIVOT_PERCENT;
+ pt->pane.cells[0][1].cmp_args[0] = pivot_value_create (
+ freq_var, PIVOT_SUM, 0, 1, PIVOT_INCLUDE_ALL);
+ pt->pane.cells[0][1].label = xstrdup (_("Percent"));
+
+ /* Valid Percent. */
+ pivot_cell_init (
+ &pt->pane.cells[0][2],
+ pivot_value_create (freq_var, PIVOT_SUM, 0, 0, PIVOT_INCLUDE_VALID));
+ pt->pane.cells[0][2].cmp = PIVOT_PERCENT;
+ pt->pane.cells[0][2].cmp_args[0] = pivot_value_create (
+ freq_var, PIVOT_SUM, 0, 1, PIVOT_INCLUDE_VALID);
+ pt->pane.cells[0][2].label = xstrdup (_("Valid Percent"));
+
+ /* Cumulative (Valid) Percent. */
+ pivot_cell_init (
+ &pt->pane.cells[0][3],
+ pivot_value_create (freq_var, PIVOT_SUM, 0, 0, PIVOT_INCLUDE_VALID));
+ pt->pane.cells[0][3].base->n_cum_vars[TABLE_VERT] = 1;
+ pt->pane.cells[0][3].cmp = PIVOT_PERCENT;
+ pt->pane.cells[0][3].cmp_args[0] = pivot_value_create (
+ freq_var, PIVOT_SUM, 0, 1, PIVOT_INCLUDE_VALID);
+ pt->pane.cells[0][3].label = xstrdup (_("Cum Percent"));
+
+ pivot_table_dump (pt);
+}
+\f
+/* Statistical display. */
- tab_value (t, 1, r, TAB_NONE, &f->value, ft->dict, &vf->print);
- tab_double (t, 2, r, TAB_NONE, f->count, wfmt);
- tab_double (t, 3, r, TAB_NONE,
- f->count / ft->total_cases * 100.0, NULL);
- tab_text (t, 4, r, TAB_NONE, _("Missing"));
- r++;
- }
+/* Returns the value of percentile P (a fraction) for a variable whose
+ valid cases have total weight VALID_CASES, by interpolating linearly
+ from X1 toward X2. The interpolation factor is the fractional part
+ of the target rank, which is (VALID_CASES - 1) * P unless the
+ COMPATIBLE algorithm is selected, in which case it is
+ (VALID_CASES + 1) * P - 1. */
+static double
+calc_percentile (double p, double valid_cases, double x1, double x2)
+{
+ double s, dummy;
- tab_box (t, TAL_1, TAL_1, -1, TAL_1, 0, 0, 5, r);
- tab_hline (t, TAL_2, 0, 5, 1);
- tab_hline (t, TAL_2, 0, 5, r);
- tab_joint_text (t, 0, r, 1, r, TAB_RIGHT | TAT_TITLE, _("Total"));
- tab_vline (t, TAL_0, 1, r, r);
- tab_double (t, 2, r, TAB_NONE, cum_freq, wfmt);
- tab_fixed (t, 3, r, TAB_NONE, 100.0, 5, 1);
- tab_fixed (t, 4, r, TAB_NONE, 100.0, 5, 1);
+ s = (settings_get_algorithm () != COMPATIBLE
+ ? modf ((valid_cases - 1) * p, &dummy)
+ : modf ((valid_cases + 1) * p - 1, &dummy));
- tab_title (t, "%s", var_to_string (vf->var));
- tab_submit (t);
+ return x1 + (x2 - x1) * s;
}
-\f
-/* Statistical display. */
-/* Calculates all the pertinent statistics for variable V, putting them in
- array D[]. */
+/* Calculates all of the percentiles for VF within FRQ, storing each
+ result in the corresponding FRQ->percentiles[].value. VF must have
+ at least one valid value.
+
+ NOTE(review): the single forward-moving PERCENTILE_IDX presumes that
+ FRQ->percentiles is sorted by ascending P -- confirm against the
+ code that builds the array. */
static void
-calc_stats (const struct frq_proc *frq,
- const struct var_freqs *vf, double d[FRQ_N_STATS])
+calc_percentiles (const struct frq_proc *frq, const struct var_freqs *vf)
{
const struct freq_tab *ft = &vf->tab;
double W = ft->valid_cases;
- struct moments *m;
- struct freq *f=0;
- double prev_value;
- int most_often;
- double X_mode;
-
+ const struct freq *f;
+ int percentile_idx;
double rank;
- int i = 0;
- int idx;
-
- /* Calculate percentiles. */
assert (ft->n_valid > 0);
- for (i = 0; i < frq->n_percentiles; i++)
- {
- struct percentile *pc = &frq->percentiles[i];
-
- pc->flag = 0;
- pc->flag2 = 0;
- }
-
rank = 0;
- prev_value = SYSMIS;
- for (idx = 0; idx < ft->n_valid; ++idx)
+ percentile_idx = 0;
+ for (f = ft->valid; f < ft->missing; f++)
{
- f = &ft->valid[idx];
- rank += f->count ;
- for (i = 0; i < frq->n_percentiles; i++)
+ /* RANK is the cumulative weight of the valid values up to and
+ including F. */
+ rank += f->count;
+ for (; percentile_idx < frq->n_percentiles; percentile_idx++)
{
- struct percentile *pc = &frq->percentiles[i];
- double tp;
-
- if ( pc->flag2 ) continue ;
-
- if ( settings_get_algorithm () != COMPATIBLE )
- tp =
- (ft->valid_cases - 1) * pc->p;
- else
- tp =
- (ft->valid_cases + 1) * pc->p - 1;
-
- if ( pc->flag )
- {
- pc->x2 = f->value.f;
- pc->x1 = prev_value;
- pc->flag2 = 1;
- continue;
- }
-
- if (rank > tp )
- {
- if ( f->count > 1 && rank - (f->count - 1) > tp )
- {
- pc->x2 = pc->x1 = f->value.f;
- pc->flag2 = 1;
- }
- else
- {
- pc->flag=1;
- }
-
- continue;
- }
- }
- prev_value = f->value.f;
- }
+ struct percentile *pc = &frq->percentiles[percentile_idx];
+ double tp;
- for (i = 0; i < frq->n_percentiles; i++)
- {
- struct percentile *pc = &frq->percentiles[i];
+ tp = (settings_get_algorithm () == ENHANCED
+ ? (W - 1) * pc->p
+ : (W + 1) * pc->p - 1);
- /* Catches the case when p == 100% */
- if ( ! pc->flag2 )
- pc->x1 = pc->x2 = f->value.f;
+ /* Target rank not reached yet: try this percentile again
+ at the next value. */
+ if (rank <= tp)
+ break;
- /*
- printf("percentile %d (p==%.2f); X1 = %g; X2 = %g\n",
- i,pc->p,pc->x1,pc->x2);
- */
+ /* Use F's own value when the target rank falls inside F's
+ run of cases, or whenever F is the last valid value. The
+ latter test must not depend on F's count: f[1] is then a
+ missing-value entry or lies past the end of the array, so
+ it must never be used for interpolation. */
+ if ((f->count > 1 && rank - (f->count - 1) > tp)
+ || f + 1 >= ft->missing)
+ pc->value = f->value.f;
+ else
+ pc->value = calc_percentile (pc->p, W, f->value.f, f[1].value.f);
+ }
}
-
- for (i = 0; i < frq->n_percentiles; i++)
+ /* Any percentile whose target rank was never exceeded gets the last
+ valid value. */
+ for (; percentile_idx < frq->n_percentiles; percentile_idx++)
{
- struct percentile *pc = &frq->percentiles[i];
- double s;
-
- double dummy;
- if ( settings_get_algorithm () != COMPATIBLE )
- {
- s = modf((ft->valid_cases - 1) * pc->p , &dummy);
- }
- else
- {
- s = modf((ft->valid_cases + 1) * pc->p -1, &dummy);
- }
-
- pc->value = pc->x1 + (pc->x2 - pc->x1) * s ;
+ struct percentile *pc = &frq->percentiles[percentile_idx];
+ pc->value = ft->valid[ft->n_valid - 1].value.f;
}
+}
+
+/* Calculates all the pertinent statistics for VF, putting them in array
+ D[]. */
+static void
+calc_stats (const struct var_freqs *vf, double d[FRQ_N_STATS])
+{
+ const struct freq_tab *ft = &vf->tab;
+ double W = ft->valid_cases;
+ const struct freq *f;
+ struct moments *m;
+ int most_often;
+ double X_mode;
+ assert (ft->n_valid > 0);
/* Calculate the mode. */
most_often = -1;
var_get_name (vf->var));
return;
}
- calc_stats (frq, vf, stat_value);
+ calc_stats (vf, stat_value);
+ calc_percentiles (frq, vf);
t = tab_create (3, frq->n_stats + frq->n_show_percentiles + 2);