From 65be63912fedc9819f76099b1ed52b189fc5fc03 Mon Sep 17 00:00:00 2001 From: John Darrington <john@darrington.wattle.id.au> Date: Sat, 20 Sep 2008 07:55:21 +0800 Subject: [PATCH] Added the MEDIAN function to AGGREGATE. Users can now aggregate data by the median. Closes bug #11975 --- doc/transformation.texi | 5 ++- src/language/stats/aggregate.c | 82 +++++++++++++++++++++++++++++++--- tests/command/aggregate.sh | 27 ++++++----- 3 files changed, 96 insertions(+), 18 deletions(-) diff --git a/doc/transformation.texi b/doc/transformation.texi index 27bbb2da..2a52ad17 100644 --- a/doc/transformation.texi +++ b/doc/transformation.texi @@ -83,7 +83,7 @@ list. Each set must have exactly as many source variables as aggregation variables. Each aggregation variable receives the results of applying the specified aggregation function to the corresponding source -variable. The MEAN, SD, and SUM aggregation functions may only be +variable. The MEAN, MEDIAN, SD, and SUM aggregation functions may only be applied to numeric variables. All the rest may be applied to numeric and short and long string variables. @@ -128,6 +128,9 @@ dictionary information from the source variable. Arithmetic mean. Limited to numeric values. The default format is F8.2. +@item MEDIAN(var_name) +The median value. Limited to numeric values. The default format is F8.2. + @item MIN(var_name) Minimum value. The aggregation variable receives the complete dictionary information from the source variable. diff --git a/src/language/stats/aggregate.c b/src/language/stats/aggregate.c index f58b97cf..e1dcd123 100644 --- a/src/language/stats/aggregate.c +++ b/src/language/stats/aggregate.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -43,6 +43,8 @@ #include <libpspp/str.h> #include <math/moments.h> #include <math/sort.h> +#include <math/statistic.h> +#include <math/percentiles.h> #include "minmax.h" #include "xalloc.h" @@ -75,12 +77,17 @@ struct agr_var char *string; bool saw_missing; struct moments1 *moments; + double cc; + + struct variable *subject; + struct variable *weight; + struct casewriter *writer; }; /* Aggregation functions. */ enum { - NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN, + NONE, SUM, MEAN, MEDIAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN, FOUT, N, NU, NMISS, NUMISS, FIRST, LAST, N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS, FUNC = 0x1f, /* Function mask. */ @@ -102,6 +109,7 @@ static const struct agr_func agr_func_tab[] = {"<NONE>", 0, -1, {0, 0, 0}}, {"SUM", 0, -1, {FMT_F, 8, 2}}, {"MEAN", 0, -1, {FMT_F, 8, 2}}, + {"MEDIAN", 0, -1, {FMT_F, 8, 2}}, {"SD", 0, -1, {FMT_F, 8, 2}}, {"MAX", 0, VAL_STRING, {-1, -1, -1}}, {"MIN", 0, VAL_STRING, {-1, -1, -1}}, @@ -135,7 +143,7 @@ enum missing_treatment struct agr_proc { /* Break variables. */ - struct case_ordering *sort; /* Sort criteria. */ + struct case_ordering *sort; /* Sort criteria (break variable). */ const struct variable **break_vars; /* Break variables. */ size_t break_var_cnt; /* Number of break variables. */ struct ccase break_case; /* Last values of break variables. */ @@ -149,6 +157,7 @@ struct agr_proc static void initialize_aggregate_info (struct agr_proc *, const struct ccase *); + static void accumulate_aggregate_info (struct agr_proc *, const struct ccase *); /* Prototypes. */ @@ -344,7 +353,8 @@ error: /* Parse all the aggregate functions. */ static bool -parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, struct agr_proc *agr) +parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, + struct agr_proc *agr) { struct agr_var *tail; /* Tail of linked list starting at agr->vars. */ @@ -545,7 +555,7 @@ parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, s variables. */ for (i = 0; i < n_dest; i++) { - struct agr_var *v = xmalloc (sizeof *v); + struct agr_var *v = xzalloc (sizeof *v); /* Add variable to chain. */ if (agr->agr_vars != NULL) @@ -703,6 +713,10 @@ agr_destroy (struct agr_proc *agr) } else if (iter->function == SD) moments1_destroy (iter->moments); + + var_destroy (iter->subject); + var_destroy (iter->weight); + free (iter); } if (agr->dict != NULL) @@ -755,6 +769,25 @@ accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input) iter->dbl[0] += v->f * weight; iter->dbl[1] += weight; break; + case MEDIAN: + { + double wv ; + struct ccase cout; + case_create (&cout, 2); + + case_data_rw (&cout, iter->subject)->f = + case_data (input, iter->src)->f; + + wv = dict_get_case_weight (agr->src_dict, input, NULL); + + case_data_rw (&cout, iter->weight)->f = wv; + + iter->cc += wv; + + casewriter_write (iter->writer, &cout); + case_destroy (&cout); + } + break; case SD: moments1_add (iter->moments, v->f, weight); break; @@ -911,6 +944,7 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) { union value *v = case_data_rw (&c, i->dest); + if (agr->missing == COLUMNWISE && i->saw_missing && (i->function & FUNC) != N && (i->function & FUNC) != NU && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS) @@ -919,6 +953,9 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) memset (v->s, ' ', var_get_width (i->dest)); else v->f = SYSMIS; + + casewriter_destroy (i->writer); + continue; } @@ -930,6 +967,25 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) case MEAN: v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS; break; + case MEDIAN: + { + struct casereader *sorted_reader; + struct order_stats *median = percentile_create (0.5, i->cc); + + sorted_reader = casewriter_make_reader (i->writer); + + order_stats_accumulate (&median, 1, + sorted_reader, + i->weight, + i->subject, + i->exclude); + + v->f = percentile_calculate ((struct percentile *) median, + PC_HAVERAGE); + + statistic_destroy ((struct statistic *) median); + } + break; case SD: { double variance; @@ -1044,6 +1100,22 @@ initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input) case MAX | FSTRING: memset (iter->string, 0, var_get_width (iter->src)); break; + case MEDIAN: + { + struct case_ordering *ordering = case_ordering_create (); + + if ( ! iter->subject) + iter->subject = var_create_internal (0); + + if ( ! iter->weight) + iter->weight = var_create_internal (1); + + case_ordering_add_var (ordering, iter->subject, SRT_ASCEND); + + iter->writer = sort_create_writer (ordering, 2); + iter->cc = 0; + } + break; case SD: if (iter->moments == NULL) iter->moments = moments1_create (MOMENT_VARIANCE); diff --git a/tests/command/aggregate.sh b/tests/command/aggregate.sh index 0992e45b..63abc6f3 100755 --- a/tests/command/aggregate.sh +++ b/tests/command/aggregate.sh @@ -146,6 +146,8 @@ cat > agg-skel.pspp <<EOF /NPOUT23I = pout.(n, 2, 3) /SPOUT23 = pout(s, '2', '3') /SPOUT23I = pout.(s, '2', '3') + /NMEDIAN = median(n) + /NMEDIANI = median.(n) /NSD = sd(n) /NSDI = sd.(n) /NSUM = sum(n) @@ -158,12 +160,13 @@ warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-o warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. -G N NI NU NUI NFGT2 NFGT2I SFGT2 SFGT2I NFIN23 NFIN23I SFIN23 SFIN23I NFLT2 NFLT2I SFLT2 SFLT2I NFIRST NFIRSTI SFIRST SFIRSTI NFOUT23 NFOUT23I SFOUT23 SFOUT23I NLAST NLASTI SLAST SLASTI NMAX NMAXI SMAX SMAXI NMEAN NMEANI NMIN NMINI SMIN SMINI NN NNI SN SNI NNMISS NNMISSI SNMISS SNMISSI NNU NNUI SNU SNUI NNUMISS NNUMISSI SNUMISS SNUMISSI NPGT2 NPGT2I SPGT2 SPGT2I NPIN23 NPIN23I SPIN23 SPIN23I NPLT2 NPLT2I SPLT2 SPLT2I NPOUT23 NPOUT23I SPOUT23 SPOUT23I NSD NSDI NSUM NSUMI -- -------- -------- ------- ------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------ ------- ------ ------- ------- -------- ------- -------- ----- ------ ----- ------ ---- ----- ---- ----- -------- -------- ---- ----- ---- ----- -------- -------- -------- -------- -------- -------- -------- -------- ------- ------- ------- ------- ------- -------- ------- -------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------- -------- ------- -------- -------- -------- -------- -------- -1 7.00 7.00 6 6 .333 .429 .333 .429 .333 .286 .333 .286 .500 .429 .500 .429 0 0 0 0 .667 .714 .667 .714 5 5 5 5 5 5 5 5 2.00 2.29 0 0 0 0 6.00 7.00 6.00 7.00 1.00 .00 1.00 .00 5 6 5 6 1 0 1 0 33.3 42.9 33.3 42.9 33.3 28.6 33.3 28.6 50.0 42.9 50.0 42.9 66.7 71.4 66.7 71.4 1.79 1.80 12.00 16.00 -2 5.00 5.00 4 4 1.000 1.000 1.000 1.000 .000 .000 .000 .000 .000 .000 .000 .000 6 6 6 4 1.000 1.000 1.000 1.000 8 8 8 8 8 8 8 8 7.00 7.00 6 6 6 4 3.00 3.00 3.00 5.00 2.00 2.00 2.00 .00 3 3 3 4 1 1 1 0 100.0 100.0 100.0 100.0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 1.00 1.00 21.00 21.00 -3 2.00 2.00 1 1 .000 .000 .000 .000 .000 .000 .000 .000 1.000 1.000 1.000 1.000 1 1 1 1 1.000 1.000 1.000 1.000 1 1 1 1 1 1 1 1 1.00 1.00 1 1 1 1 2.00 2.00 2.00 2.00 .00 .00 .00 .00 1 1 1 1 0 0 0 0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 .00 .00 2.00 2.00 -4 1.00 1.00 1 1 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 4 . . 4 . . . . 4 .00 .00 .00 1.00 1.00 1.00 1.00 .00 0 0 0 1 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 . . . . +G N NI NU NUI NFGT2 NFGT2I SFGT2 SFGT2I NFIN23 NFIN23I SFIN23 SFIN23I NFLT2 NFLT2I SFLT2 SFLT2I NFIRST NFIRSTI SFIRST SFIRSTI NFOUT23 NFOUT23I SFOUT23 SFOUT23I NLAST NLASTI SLAST SLASTI NMAX NMAXI SMAX SMAXI NMEAN NMEANI NMIN NMINI SMIN SMINI NN NNI SN SNI NNMISS NNMISSI SNMISS SNMISSI NNU NNUI SNU SNUI NNUMISS NNUMISSI SNUMISS SNUMISSI NPGT2 NPGT2I SPGT2 SPGT2I NPIN23 NPIN23I SPIN23 SPIN23I NPLT2 NPLT2I SPLT2 SPLT2I NPOUT23 NPOUT23I SPOUT23 SPOUT23I NMEDIAN NMEDIANI NSD NSDI NSUM NSUMI +- -------- -------- ------- ------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------ ------- ------ ------- ------- -------- ------- -------- ----- ------ ----- ------ ---- ----- ---- ----- -------- -------- ---- ----- ---- ----- -------- -------- -------- -------- -------- -------- -------- -------- ------- ------- ------- ------- ------- -------- ------- -------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------- -------- ------- -------- -------- -------- -------- -------- -------- -------- +1 7.00 7.00 6 6 .333 .429 .333 .429 .333 .286 .333 .286 .500 .429 .500 .429 0 0 0 0 .667 .714 .667 .714 5 5 5 5 5 5 5 5 2.00 2.29 0 0 0 0 6.00 7.00 6.00 7.00 1.00 .00 1.00 .00 5 6 5 6 1 0 1 0 33.3 42.9 33.3 42.9 33.3 28.6 33.3 28.6 50.0 42.9 50.0 42.9 66.7 71.4 66.7 71.4 1.50 2.00 1.79 1.80 12.00 16.00 +2 5.00 5.00 4 4 1.000 1.000 1.000 1.000 .000 .000 .000 .000 .000 .000 .000 .000 6 6 6 4 1.000 1.000 1.000 1.000 8 8 8 8 8 8 8 8 7.00 7.00 6 6 6 4 3.00 3.00 3.00 5.00 2.00 2.00 2.00 .00 3 3 3 4 1 1 1 0 100.0 100.0 100.0 100.0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 7.00 7.00 1.00 1.00 21.00 21.00 +3 2.00 2.00 1 1 .000 .000 .000 .000 .000 .000 .000 .000 1.000 1.000 1.000 1.000 1 1 1 1 1.000 1.000 1.000 1.000 1 1 1 1 1 1 1 1 1.00 1.00 1 1 1 1 2.00 2.00 2.00 2.00 .00 .00 .00 .00 1 1 1 1 0 0 0 0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 1.00 1.00 .00 .00 2.00 2.00 +4 1.00 1.00 1 1 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 4 . . 4 . . . . 4 .00 .00 .00 1.00 1.00 1.00 1.00 .00 0 0 0 1 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 NaN NaN . . . . + EOF activity="expected output (columnwise missing) create" @@ -172,12 +175,12 @@ warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-o warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. warning: AGGREGATE: The value arguments passed to the FOUT function are out-of-order. They will be treated as if they had been specified in the correct order. -G N NI NU NUI NFGT2 NFGT2I SFGT2 SFGT2I NFIN23 NFIN23I SFIN23 SFIN23I NFLT2 NFLT2I SFLT2 SFLT2I NFIRST NFIRSTI SFIRST SFIRSTI NFOUT23 NFOUT23I SFOUT23 SFOUT23I NLAST NLASTI SLAST SLASTI NMAX NMAXI SMAX SMAXI NMEAN NMEANI NMIN NMINI SMIN SMINI NN NNI SN SNI NNMISS NNMISSI SNMISS SNMISSI NNU NNUI SNU SNUI NNUMISS NNUMISSI SNUMISS SNUMISSI NPGT2 NPGT2I SPGT2 SPGT2I NPIN23 NPIN23I SPIN23 SPIN23I NPLT2 NPLT2I SPLT2 SPLT2I NPOUT23 NPOUT23I SPOUT23 SPOUT23I NSD NSDI NSUM NSUMI -- -------- -------- ------- ------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------ ------- ------ ------- ------- -------- ------- -------- ----- ------ ----- ------ ---- ----- ---- ----- -------- -------- ---- ----- ---- ----- -------- -------- -------- -------- -------- -------- -------- -------- ------- ------- ------- ------- ------- -------- ------- -------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------- -------- ------- -------- -------- -------- -------- -------- -1 7.00 7.00 6 6 . .429 . .429 . .286 . .286 . .429 . .429 . 0 0 . .714 . .714 . 5 5 . 5 5 . 2.29 . 0 0 6.00 7.00 6.00 7.00 1.00 .00 1.00 .00 5 6 5 6 1 0 1 0 . 42.9 . 42.9 . 28.6 . 28.6 . 42.9 . 42.9 . 71.4 . 71.4 . 1.80 . 16.00 -2 5.00 5.00 4 4 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 8 . . 8 . . . . 4 3.00 3.00 3.00 5.00 2.00 2.00 2.00 .00 3 3 3 4 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 . . . . -3 2.00 2.00 1 1 .000 .000 .000 .000 .000 .000 .000 .000 1.000 1.000 1.000 1.000 1 1 1 1 1.000 1.000 1.000 1.000 1 1 1 1 1 1 1 1 1.00 1.00 1 1 1 1 2.00 2.00 2.00 2.00 .00 .00 .00 .00 1 1 1 1 0 0 0 0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 .00 .00 2.00 2.00 -4 1.00 1.00 1 1 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 4 . . 4 . . . . 4 .00 .00 .00 1.00 1.00 1.00 1.00 .00 0 0 0 1 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 . . . . +G N NI NU NUI NFGT2 NFGT2I SFGT2 SFGT2I NFIN23 NFIN23I SFIN23 SFIN23I NFLT2 NFLT2I SFLT2 SFLT2I NFIRST NFIRSTI SFIRST SFIRSTI NFOUT23 NFOUT23I SFOUT23 SFOUT23I NLAST NLASTI SLAST SLASTI NMAX NMAXI SMAX SMAXI NMEAN NMEANI NMIN NMINI SMIN SMINI NN NNI SN SNI NNMISS NNMISSI SNMISS SNMISSI NNU NNUI SNU SNUI NNUMISS NNUMISSI SNUMISS SNUMISSI NPGT2 NPGT2I SPGT2 SPGT2I NPIN23 NPIN23I SPIN23 SPIN23I NPLT2 NPLT2I SPLT2 SPLT2I NPOUT23 NPOUT23I SPOUT23 SPOUT23I NMEDIAN NMEDIANI NSD NSDI NSUM NSUMI +- -------- -------- ------- ------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------ ------- ------ ------- ------- -------- ------- -------- ----- ------ ----- ------ ---- ----- ---- ----- -------- -------- ---- ----- ---- ----- -------- -------- -------- -------- -------- -------- -------- -------- ------- ------- ------- ------- ------- -------- ------- -------- ----- ------ ----- ------ ------ ------- ------ ------- ----- ------ ----- ------ ------- -------- ------- -------- -------- -------- -------- -------- -------- -------- +1 7.00 7.00 6 6 . .429 . .429 . .286 . .286 . .429 . .429 . 0 0 . .714 . .714 . 5 5 . 5 5 . 2.29 . 0 0 6.00 7.00 6.00 7.00 1.00 .00 1.00 .00 5 6 5 6 1 0 1 0 . 42.9 . 42.9 . 28.6 . 28.6 . 42.9 . 42.9 . 71.4 . 71.4 . 2.00 . 1.80 . 16.00 +2 5.00 5.00 4 4 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 8 . . 8 . . . . 4 3.00 3.00 3.00 5.00 2.00 2.00 2.00 .00 3 3 3 4 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 . . . . . . +3 2.00 2.00 1 1 .000 .000 .000 .000 .000 .000 .000 .000 1.000 1.000 1.000 1.000 1 1 1 1 1.000 1.000 1.000 1.000 1 1 1 1 1 1 1 1 1.00 1.00 1 1 1 1 2.00 2.00 2.00 2.00 .00 .00 .00 .00 1 1 1 1 0 0 0 0 .0 .0 .0 .0 .0 .0 .0 .0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 1.00 1.00 .00 .00 2.00 2.00 +4 1.00 1.00 1 1 . . . 1.000 . . . .000 . . . .000 . . 4 . . . 1.000 . . 4 . . 4 . . . . 4 .00 .00 .00 1.00 1.00 1.00 1.00 .00 0 0 0 1 1 1 1 0 . . . 100.0 . . . .0 . . . .0 . . . 100.0 . . . . . . EOF for outfile in scratch active external; do -- 2.30.2