From b9e28aa5614a079548c616bcf97aa804024ad647 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 30 Mar 2004 00:42:46 +0000 Subject: [PATCH] Add multipass procedures. Add two-pass moments calculation. Rewrite DESCRIPTIVES as a multipass procedure to use two-pass moments. Fix some memory leaks. --- src/ChangeLog | 137 +++ src/Makefile.am | 47 +- src/aggregate.c | 37 +- src/algorithm.c | 31 +- src/bitvector.h | 3 +- src/casefile.c | 815 +++++++++++++++++ src/casefile.h | 48 + src/command.c | 6 + src/command.def | 2 + src/crosstabs.q | 62 +- src/debug.c | 71 +- src/descript.q | 860 ------------------ src/expr-evl.c | 78 +- src/expr-opt.c | 1 - src/expr-prs.c | 76 +- src/frequencies.q | 1 - src/levene.c | 6 +- src/misc.h | 23 + src/moments.c | 426 +++++++++ src/moments.h | 61 ++ src/sort.c | 143 +-- src/stats.c | 203 ----- src/stats.h | 88 -- src/t-test.q | 36 +- src/var.h | 31 - src/vfm.c | 307 ++----- src/vfm.h | 13 +- src/workspace.c | 53 ++ src/workspace.h | 28 + tests/ChangeLog | 20 + tests/Makefile.am | 7 +- tests/command/weight.sh | 2 +- .../descript-basic.sh} | 0 tests/stats/descript-missing.sh | 127 +++ tests/stats/moments.sh | 97 ++ tests/xforms/casefile.sh | 70 ++ tests/xforms/expressions.sh | 2 +- 37 files changed, 2319 insertions(+), 1699 deletions(-) create mode 100644 src/casefile.c create mode 100644 src/casefile.h delete mode 100644 src/descript.q create mode 100644 src/moments.c create mode 100644 src/moments.h delete mode 100644 src/stats.c delete mode 100644 src/stats.h create mode 100644 src/workspace.c create mode 100644 src/workspace.h rename tests/{command/descriptives.sh => stats/descript-basic.sh} (100%) create mode 100755 tests/stats/descript-missing.sh create mode 100755 tests/stats/moments.sh create mode 100755 tests/xforms/casefile.sh diff --git a/src/ChangeLog b/src/ChangeLog index ff80895816..6e121c9fa5 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,140 @@ +Mon Mar 29 16:26:40 2004 Ben Pfaff + + * debug.c: Removed. Moved cmd_debug_evaluate() into expr-evl.c. + + * expr-evl.c: (cmd_debug_evaluate) Moved here from debug.c. + +Mon Mar 29 16:03:08 2004 Ben Pfaff + + * algorithm.c: By default turn off some of the more expensive + assertions. + (expensive_assert) New macro which expands to assert if + EXTRA_CHECKS is defined, to nothing otherwise. + (unique) Use expensive_assert(). + (binary_search) Ditto. + (push_heap) Ditto. + (pop_heap) Ditto. + (make_heap) Ditto. + (sort_heap) Ditto. + + * command.c: (conflicting_3char_prefixes) Words that are the same + don't cause conflicts when they are abbreviated to the first three + letters. + + * expr-evl.c: (CONCAT_func) Fix memory leak by incrementing struct + nonterm_node's n earlier. + (generic_str_func) Ditto. + +Mon Mar 29 15:32:17 2004 Ben Pfaff + + Add support for multipass procedures. Rewrite DESCRIPTIVES to + test multipass support, take advantage of new moments + calculation, and to not be such crappy code. Get rid of q2c + processing for DESCRIPTIVES. + + * vfm.c: (struct multipass_split_aux_data) New structure. + (multipass_procedure_with_splits) New function. + (multipass_split_callback) New function. + (multipass_split_output) New function. + * descript.q: Removed. + + * descript.c: New file. + + * var.h: Removed descriptives enums. + (struct descriptives_proc) Removed. + (struct variable) Removed p.dsc. + + * Makefile.am: (q_sources_c) Remove descript.c. + (q_sources_q) Removed descript.q. + +Mon Mar 29 15:31:55 2004 Ben Pfaff + + New manager for keeping track of used workspace. + + * workspace.c: New file. + + * workspace.h: New file. + + * Makefile.am: (pspp_SOURCES) Add workspace.c, workspace.h. + + * sort.c: (do_internal_sort) Use workspace_malloc(). + (destroy_internal_sort) Use workspace_free(). + +Mon Mar 29 15:31:08 2004 Ben Pfaff + + New `struct casefile' for managing sets of cases. + + * casefile.c: New file. + + * casefile.h: New file. + + * command.def: Add DEBUG CASEFILE command. + + * Makefile.am: (pspp_SOURCES) Add casefile.c, casefile.h. + + * sort.c: (sort_cases) Move logic for sending storage file to disk + into do_external_sort(). + (struct internal_sort) Use an array of ccase pointers instead of a + case_list. + (do_internal_sort) Rewrite to handle casefiles. + (compare_case_list) Removed. + (compare_cases) New function. + (compare_case_dblptrs) New function. + (read_internal_sort_output) Deal with new struct internal_sort. + + * vfm.c: (static var workspace_overflow) Removed. + (struct storage_stream_info) Removed all the members. Added + struct casefile * member. + (storage_sink_open) Use casefile. + (open_storage_file) Removed. + (write_storage_file) Removed. + (storage_to_disk) Removed. + (destroy_storage_stream_info) Use casefile. + (storage_sink_write) Use casefile. + (storage_sink_make_source) Use casefile. + (storage_source_count) Use casefile. + (storage_source_read) Use casefile. + (storage_source_on_disk) Removed. + (storage_source_get_cases) Removed. + (storage_source_set_cases) Removed. + (storage_source_get_casefile) New function. + +Mon Mar 29 15:30:09 2004 Ben Pfaff + + New `struct moments' for calculating moments. + + * stats.c: Removed. + + * stats.h: Removed. + + * moments.c: New file. + + * moments.h: New file. + + * command.def: Add DEBUG MOMENTS command. + + * Makefile.am: (pspp_SOURCES) Add moments.c, moments.h. Remove + stats.c, stats.h. + + * aggregate.c: Modify AGGREGATE to use the new moments + calculation, even if not in such a great way. + (struct agr_var) Add `moments' member. + (parse_aggregate_functions) Set `moments' member to null. + (agr_destroy) Destroy `moments' member. + (accumulate_aggregate_info) Use `moments' for standard deviation. + (dump_aggregate_info) Ditto. + (initialize_aggregate_info) Create or clear `moments'. + + * misc.h: Add pow2(), pow3(), pow4() functions in place of sqr(), + cube(), pow4() that were in stats.h. All references updated. + + * crosstabs.q: stats.h had chi-square significance functions. Use + GSL instead. + (display_chisq) Use gsl_cdf_chisq_Q() instead of chisq_sig(). + + * expr-evl.c: (expr_evaluate) Use moments_of_values() for + OP_CFVAR, OP_MEAN, OP_SD, OP_VARIANCE. + Fri Mar 26 14:21:23 2004 Ben Pfaff * dictionary.c: (dict_compact_values) Compacted values need to diff --git a/src/Makefile.am b/src/Makefile.am index 8dd9c439c0..5e12142a59 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -27,37 +27,34 @@ $(q_sources_c): q2c$(EXEEXT) .q.c: ./q2c $< $@ -q_sources_c = correlations.c crosstabs.c descript.c file-handle.c \ +q_sources_c = correlations.c crosstabs.c file-handle.c \ frequencies.c list.c means.c set.c t-test.c -q_sources_q = correlations.q crosstabs.q descript.q file-handle.q \ +q_sources_q = correlations.q crosstabs.q file-handle.q \ frequencies.q list.q means.q set.q t-test.q -pspp_SOURCES = $(q_sources_c) \ -aggregate.c algorithm.c algorithm.h alloc.c alloc.h \ -apply-dict.c ascii.c autorecode.c bitvector.h \ -cmdline.c cmdline.h command.c command.def command.h compute.c \ -copyleft.c copyleft.h \ -count.c data-in.c data-in.h data-list.c data-list.h \ -data-out.c date.c debug.c debug-print.h devind.c devind.h dfm.c dfm.h \ -dictionary.c do-if.c do-ifP.h error.c error.h expr-evl.c expr-opt.c \ -expr-prs.c expr.h exprP.h file-handle.h file-type.c \ -filename.c filename.h flip.c font.h format.c format.def format.h \ -formats.c get.c getline.c getline.h glob.c glob.h \ -groff-font.c hash.c hash.h html.c htmlP.h include.c \ -inpt-pgm.c lexer.c lexer.h levene.c levene.h \ -log.h loop.c magic.c magic.h main.c \ -main.h matrix-data.c matrix.c matrix.h mis-val.c misc.c misc.h \ -modify-vars.c numeric.c output.c output.h pfm-read.c pfm-write.c pfm.h \ -pool.c pool.h postscript.c print.c random.c random.h recode.c \ -rename-vars.c repeat.c repeat.h sample.c sel-if.c settings.h \ +pspp_SOURCES = $(q_sources_c) aggregate.c algorithm.c algorithm.h \ +alloc.c alloc.h apply-dict.c ascii.c autorecode.c bitvector.h \ +casefile.c casefile.h cmdline.c cmdline.h command.c command.def \ +command.h compute.c copyleft.c copyleft.h count.c data-in.c data-in.h \ +data-list.c data-list.h data-out.c date.c debug-print.h descript.c \ +devind.c devind.h dfm.c dfm.h dictionary.c do-if.c do-ifP.h error.c \ +error.h expr-evl.c expr-opt.c expr-prs.c expr.h exprP.h file-handle.h \ +file-type.c filename.c filename.h flip.c font.h format.c format.def \ +format.h formats.c get.c getline.c getline.h glob.c glob.h \ +groff-font.c hash.c hash.h html.c htmlP.h include.c inpt-pgm.c lexer.c \ +lexer.h levene.c levene.h log.h loop.c magic.c magic.h main.c main.h \ +matrix-data.c matrix.c matrix.h mis-val.c misc.c misc.h modify-vars.c \ +moments.c moments.h numeric.c output.c output.h pfm-read.c pfm-write.c \ +pfm.h pool.c pool.h postscript.c print.c random.c random.h recode.c \ +rename-vars.c repeat.c repeat.h sample.c sel-if.c settings.h \ sfm-read.c sfm-write.c sfm.h sfmP.h som.c som.h sort.c sort.h \ -split-file.c stat.h stats.c stats.h str.c str.h sysfile-info.c tab.c \ -tab.h temporary.c title.c t-test.h val.h val-labs.c value-labels.c \ -value-labels.h var-labs.c var.h vars-atr.c vars-prs.c vector.c \ -version.c version.h vfm.c vfm.h vfmP.h weight.c +split-file.c str.c str.h sysfile-info.c tab.c tab.h temporary.c \ +title.c t-test.h val.h val-labs.c value-labels.c value-labels.h \ +var-labs.c var.h vars-atr.c vars-prs.c vector.c version.c version.h \ +vfm.c vfm.h vfmP.h weight.c workspace.c workspace.h -pspp_LDADD = ../lib/julcal/libjulcal.a \ +pspp_LDADD = ../lib/julcal/libjulcal.a \ ../lib/misc/libmisc.a \ @LIBINTL@ diff --git a/src/aggregate.c b/src/aggregate.c index 3821de78b8..b345ca113e 100644 --- a/src/aggregate.c +++ b/src/aggregate.c @@ -26,11 +26,11 @@ #include "file-handle.h" #include "lexer.h" #include "misc.h" +#include "moments.h" #include "pool.h" #include "settings.h" #include "sfm.h" #include "sort.h" -#include "stats.h" #include "str.h" #include "var.h" #include "vfm.h" @@ -53,6 +53,7 @@ struct agr_var int int1, int2; char *string; int missing; + struct moments *moments; }; /* Aggregation functions. */ @@ -511,6 +512,7 @@ parse_aggregate_functions (struct agr_proc *agr) agr->vars = v; tail = v; tail->next = NULL; + v->moments = NULL; /* Create the target variable in the aggregate dictionary. */ @@ -663,6 +665,8 @@ agr_destroy (struct agr_proc *agr) free (iter->arg[i].c); free (iter->string); } + else if (iter->function == SD) + moments_destroy (iter->moments); free (iter); } free (agr->prev_break); @@ -825,14 +829,9 @@ accumulate_aggregate_info (struct agr_proc *agr, iter->dbl[0] += v->f * weight; iter->dbl[1] += weight; break; - case SD: - { - double product = v->f * weight; - iter->dbl[0] += product; - iter->dbl[1] += product * v->f; - iter->dbl[2] += weight; - break; - } + case SD: + moments_pass_two (iter->moments, v->f, weight); + break; case MAX: iter->dbl[0] = max (iter->dbl[0], v->f); iter->int1 = 1; @@ -992,9 +991,17 @@ dump_aggregate_info (struct agr_proc *agr, struct ccase *output) v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS; break; case SD: - v->f = ((i->dbl[2] > 1.0) - ? calc_stddev (calc_variance (i->dbl, i->dbl[2])) - : SYSMIS); + { + double variance; + + /* FIXME: we should use two passes. */ + moments_calculate (i->moments, NULL, NULL, &variance, + NULL, NULL); + if (variance != SYSMIS) + v->f = sqrt (variance); + else + v->f = SYSMIS; + } break; case MAX: case MIN: @@ -1088,6 +1095,12 @@ initialize_aggregate_info (struct agr_proc *agr) case MAX | FSTRING: memset (iter->string, 0, iter->src->width); break; + case SD: + if (iter->moments == NULL) + iter->moments = moments_create (MOMENT_VARIANCE); + else + moments_clear (iter->moments); + break; default: iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0; iter->int1 = iter->int2 = 0; diff --git a/src/algorithm.c b/src/algorithm.c index f294887b2d..89e813defa 100644 --- a/src/algorithm.c +++ b/src/algorithm.c @@ -97,10 +97,12 @@ #include "alloc.h" #include "random.h" -/* Some of the assertions in this file are very expensive. If - we're optimizing, don't include them. */ -#if __OPTIMIZE__ -#define NDEBUG +/* Some of the assertions in this file are very expensive. We + don't use them by default. */ +#ifdef EXTRA_CHECKS +#define expensive_assert(X) assert(X) +#else +#define expensive_assert(X) ((void) 0) #endif #include "error.h" @@ -202,7 +204,8 @@ unique (void *array, size_t count, size_t size, first += size; if (first >= last) { - assert (adjacent_find_equal (array, count, size, compare, aux) == NULL); + assert (adjacent_find_equal (array, count, + size, compare, aux) == NULL); return count; } @@ -471,7 +474,7 @@ binary_search (const void *array, size_t count, size_t size, } } - assert (find (array, count, size, value, compare, aux) == NULL); + expensive_assert (find (array, count, size, value, compare, aux) == NULL); return NULL; } @@ -825,7 +828,8 @@ push_heap (void *array, size_t count, size_t size, unsigned char *first = array; size_t i; - assert (count < 1 || is_heap (array, count - 1, size, compare, aux)); + expensive_assert (count < 1 || is_heap (array, count - 1, + size, compare, aux)); for (i = count; i > 1; i /= 2) { unsigned char *parent = first + (i / 2 - 1) * size; @@ -835,7 +839,7 @@ push_heap (void *array, size_t count, size_t size, else break; } - assert (is_heap (array, count, size, compare, aux)); + expensive_assert (is_heap (array, count, size, compare, aux)); } /* ARRAY contains COUNT elements of SIZE bytes each. Initially @@ -886,10 +890,11 @@ pop_heap (void *array, size_t count, size_t size, { unsigned char *first = array; - assert (is_heap (array, count, size, compare, aux)); + expensive_assert (is_heap (array, count, size, compare, aux)); SWAP (first, first + (count - 1) * size, size); heapify (first, count - 1, size, 1, compare, aux); - assert (count < 1 || is_heap (array, count - 1, size, compare, aux)); + expensive_assert (count < 1 || is_heap (array, count - 1, + size, compare, aux)); } /* Turns ARRAY, which contains COUNT elements of SIZE bytes, into @@ -903,7 +908,7 @@ make_heap (void *array, size_t count, size_t size, for (idx = count / 2; idx >= 1; idx--) heapify (array, count, size, idx, compare, aux); - assert (count < 1 || is_heap (array, count, size, compare, aux)); + expensive_assert (count < 1 || is_heap (array, count, size, compare, aux)); } /* ARRAY contains COUNT elements of SIZE bytes each. Initially @@ -917,13 +922,13 @@ sort_heap (void *array, size_t count, size_t size, unsigned char *first = array; size_t idx; - assert (is_heap (array, count, size, compare, aux)); + expensive_assert (is_heap (array, count, size, compare, aux)); for (idx = count; idx >= 2; idx--) { SWAP (first, first + (idx - 1) * size, size); heapify (array, idx - 1, size, 1, compare, aux); } - assert (is_sorted (array, count, size, compare, aux)); + expensive_assert (is_sorted (array, count, size, compare, aux)); } /* ARRAY contains COUNT elements of SIZE bytes each. This diff --git a/src/bitvector.h b/src/bitvector.h index 6c3a878b16..5e06799511 100644 --- a/src/bitvector.h +++ b/src/bitvector.h @@ -39,7 +39,6 @@ (((unsigned char *) X)[(Y) / CHAR_BIT] & (1 << ((Y) % CHAR_BIT))) /* Returns 2**X, 0 <= X < 32. */ -#define BIT_INDEX(X) \ - (1ul << (X)) +#define BIT_INDEX(X) (1ul << (X)) #endif /* bitvector.h */ diff --git a/src/casefile.c b/src/casefile.c new file mode 100644 index 0000000000..9c889b6422 --- /dev/null +++ b/src/casefile.c @@ -0,0 +1,815 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2004 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#include +#include "casefile.h" +#include +#include +#include +#include +#include +#include +#include +#include "alloc.h" +#include "error.h" +#include "misc.h" +#include "var.h" +#include "workspace.h" + +#define IO_BUF_SIZE 8192 + +/* A casefile is a sequentially accessible array of immutable + cases. It may be stored in memory or on disk as workspace + allows. Cases may be appended to the end of the file. Cases + may be read sequentially starting from the beginning of the + file. Once any cases have been read, no more cases may be + appended. The entire file is discarded at once. */ + +/* A casefile. */ +struct casefile + { + /* Basic data. */ + struct casefile *next, *prev; /* Next, prev in global list. */ + size_t case_size; /* Case size in bytes. */ + size_t case_list_size; /* Bytes to allocate for case_lists. */ + unsigned long case_cnt; /* Number of cases stored. */ + enum { MEMORY, DISK } storage; /* Where cases are stored. */ + enum { WRITE, READ } mode; /* Is writing or reading allowed? */ + struct casereader *readers; /* List of our readers. */ + + /* Memory storage. */ + struct case_list *head, *tail; /* Beginning, end of list of cases. */ + + /* Disk storage. */ + int fd; /* File descriptor, -1 if none. */ + char *filename; /* Filename. */ + unsigned char *buffer; /* I/O buffer, NULL if none. */ + size_t buffer_used; /* Number of bytes used in buffer. */ + size_t buffer_size; /* Buffer size in bytes. */ + }; + +/* For reading out the casing in a casefile. */ +struct casereader + { + struct casereader *next, *prev; /* Next, prev in casefile's list. */ + struct casefile *cf; /* Our casefile. */ + unsigned long case_idx; /* Case number of current case. */ + + /* Memory storage. */ + struct case_list *cur; /* Current case. */ + + /* Disk storage. */ + int fd; /* File descriptor. */ + unsigned char *buffer; /* I/O buffer. */ + size_t buffer_pos; /* Byte offset of buffer position. */ + }; + +struct casefile *casefiles; + +static void register_atexit (void); +static void exit_handler (void); + +static void reader_open_file (struct casereader *reader); +static void write_case_to_disk (struct casefile *cf, const struct ccase *c); +static void flush_buffer (struct casefile *cf); +static void fill_buffer (struct casereader *reader); + +static int safe_open (const char *filename, int flags); +static int safe_close (int fd); +static int full_read (int fd, char *buffer, size_t size); +static int full_write (int fd, const char *buffer, size_t size); + +/* Creates and returns a casefile to store cases of CASE_SIZE bytes each. */ +struct casefile * +casefile_create (size_t case_size) +{ + struct casefile *cf = xmalloc (sizeof *cf); + cf->next = casefiles; + cf->prev = NULL; + if (cf->next != NULL) + cf->next->prev = cf; + casefiles = cf; + cf->case_size = case_size; + cf->case_list_size = sizeof *cf->head + case_size - sizeof *cf->head->c.data; + cf->case_cnt = 0; + cf->storage = MEMORY; + cf->mode = WRITE; + cf->readers = NULL; + cf->head = cf->tail = NULL; + cf->fd = -1; + cf->filename = NULL; + cf->buffer = NULL; + cf->buffer_size = ROUND_UP (case_size, IO_BUF_SIZE); + if (case_size > 0 && cf->buffer_size % case_size > 512) + cf->buffer_size = case_size; + cf->buffer_used = 0; + register_atexit (); + return cf; +} + +/* Destroys casefile CF. */ +void +casefile_destroy (struct casefile *cf) +{ + if (cf != NULL) + { + struct case_list *iter, *next; + + if (cf->next != NULL) + cf->next->prev = cf->prev; + if (cf->prev != NULL) + cf->prev->next = cf->next; + if (casefiles == cf) + casefiles = cf->next; + + while (cf->readers != NULL) + casereader_destroy (cf->readers); + + for (iter = cf->head; iter != NULL; iter = next) + { + next = iter->next; + workspace_free (iter, cf->case_list_size); + } + + if (cf->fd != -1) + safe_close (cf->fd); + + if (cf->filename != NULL && remove (cf->filename) == -1) + msg (ME, _("%s: Removing temporary file: %s."), + cf->filename, strerror (errno)); + + free (cf->buffer); + + free (cf); + } +} + +/* Returns nonzero only if casefile CF is stored in memory (instead of on + disk). */ +int +casefile_in_core (const struct casefile *cf) +{ + assert (cf != NULL); + + return cf->storage == MEMORY; +} + +/* Returns the number of bytes in a case for CF. */ +size_t +casefile_get_case_size (const struct casefile *cf) +{ + assert (cf != NULL); + + return cf->case_size; +} + +/* Returns the number of cases in casefile CF. */ +unsigned long +casefile_get_case_cnt (const struct casefile *cf) +{ + assert (cf != NULL); + + return cf->case_cnt; +} + +/* Appends case C to casefile CF. Not valid after any reader for CF has been + created. */ +void +casefile_append (struct casefile *cf, const struct ccase *c) +{ + assert (cf != NULL); + assert (c != NULL); + assert (cf->mode == WRITE); + + cf->case_cnt++; + + /* Try memory first. */ + if (cf->storage == MEMORY) + { + struct case_list *new_case; + + new_case = workspace_malloc (cf->case_list_size); + if (new_case != NULL) + { + memcpy (&new_case->c, c, cf->case_size); + new_case->next = NULL; + if (cf->head != NULL) + cf->tail->next = new_case; + else + cf->head = new_case; + cf->tail = new_case; + } + else + { + casefile_to_disk (cf); + assert (cf->storage == DISK); + write_case_to_disk (cf, c); + } + } + else + write_case_to_disk (cf, c); +} + +/* Writes case C to casefile CF's disk buffer, first flushing the buffer to + disk if it would otherwise overflow. */ +static void +write_case_to_disk (struct casefile *cf, const struct ccase *c) +{ + memcpy (cf->buffer + cf->buffer_used, c->data, cf->case_size); + cf->buffer_used += cf->case_size; + if (cf->buffer_used + cf->case_size > cf->buffer_size) + flush_buffer (cf); + +} + +static void +flush_buffer (struct casefile *cf) +{ + if (cf->buffer_used > 0) + { + if (!full_write (cf->fd, cf->buffer, cf->buffer_size)) + msg (FE, _("Error writing temporary file: %s."), strerror (errno)); + + cf->buffer_used = 0; + } +} + +/* Creates a temporary file and stores its name in *FILENAME and + a file descriptor for it in *FD. Returns success. Caller is + responsible for freeing *FILENAME. */ +static int +make_temp_file (int *fd, char **filename) +{ + const char *parent_dir; + + assert (filename != NULL); + assert (fd != NULL); + + if (getenv ("TMPDIR") != NULL) + parent_dir = getenv ("TMPDIR"); + else + parent_dir = P_tmpdir; + + *filename = xmalloc (strlen (parent_dir) + 32); + sprintf (*filename, "%s%cpsppXXXXXX", parent_dir, DIR_SEPARATOR); + *fd = mkstemp (*filename); + if (*fd < 0) + { + msg (FE, _("%s: Creating temporary file: %s."), + *filename, strerror (errno)); + free (*filename); + *filename = NULL; + return 0; + } + return 1; +} + +/* If CF is currently stored in memory, writes it to disk. Readers, if any, + retain their current positions. */ +void +casefile_to_disk (struct casefile *cf) +{ + struct case_list *iter, *next; + struct casereader *reader; + + assert (cf != NULL); + + if (cf->storage == MEMORY) + { + assert (cf->filename == NULL); + assert (cf->fd == -1); + assert (cf->buffer_used == 0); + + cf->storage = DISK; + if (!make_temp_file (&cf->fd, &cf->filename)) + err_failure (); +#if HAVE_POSIX_FADVISE + posix_fadvise (cf->fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + cf->buffer = xmalloc (cf->buffer_size); + memset (cf->buffer, 0, cf->buffer_size); + + for (iter = cf->head; iter != NULL; iter = next) + { + next = iter->next; + write_case_to_disk (cf, &iter->c); + workspace_free (iter, cf->case_list_size); + } + flush_buffer (cf); + cf->head = cf->tail = NULL; + + for (reader = cf->readers; reader != NULL; reader = reader->next) + reader_open_file (reader); + } +} + +/* Merges lists A and B into a single list, which is returned. Cases are + compared according to comparison function COMPARE, which receives auxiliary + data AUX. */ +static struct case_list * +merge (struct case_list *a, struct case_list *b, + int (*compare) (const struct ccase *, + const struct ccase *, void *aux), + void *aux) +{ + struct case_list head; + struct case_list *tail = &head; + + while (a != NULL && b != NULL) + if (compare (&a->c, &b->c, aux) < 0) + { + tail->next = a; + tail = a; + a = a->next; + } + else + { + tail->next = b; + tail = b; + b = b->next; + } + + tail->next = a == NULL ? b : a; + + return head.next; +} + +/* Sorts the list beginning at FIRST, returning the new first case. Cases are + compared according to comparison function COMPARE, which receives auxiliary + data AUX. */ +static struct case_list * +merge_sort (struct case_list *first, + int (*compare) (const struct ccase *, + const struct ccase *, void *aux), + void *aux) +{ + /* FIXME: we should use a "natural" merge sort to take + advantage of the natural order of the data. */ + struct case_list *middle, *last, *tmp; + + /* A list of zero or one elements is already sorted. */ + if (first == NULL || first->next == NULL) + return first; + + middle = first; + last = first->next; + while (last != NULL && last->next != NULL) + { + middle = middle->next; + last = last->next->next; + } + tmp = middle; + middle = middle->next; + tmp->next = NULL; + return merge (merge_sort (first, compare, aux), + merge_sort (middle, compare, aux), + compare, aux); +} + +/* Tries to sort casefile CF according to comparison function + COMPARE, which is passes auxiliary data AUX. If successful, + returns nonzero. Currently only sorting of in-memory + casefiles is implemented. */ +int +casefile_sort (struct casefile *cf, + int (*compare) (const struct ccase *, + const struct ccase *, void *aux), + void *aux) +{ + assert (cf != NULL); + assert (compare != NULL); + + cf->mode = WRITE; + + if (cf->case_cnt < 2) + return 1; + else if (cf->storage == DISK) + return 0; + else + { + cf->head = cf->tail = merge_sort (cf->head, compare, aux); + while (cf->tail->next != NULL) + cf->tail = cf->tail->next; + + return 1; + } +} + +/* Creates and returns a casereader for CF. A casereader can be used to + sequentially read the cases in a casefile. */ +struct casereader * +casefile_get_reader (const struct casefile *cf_) +{ + struct casefile *cf = (struct casefile *) cf_; + struct casereader *reader; + + /* Flush the buffer to disk if it's not empty. */ + if (cf->mode == WRITE && cf->storage == DISK) + flush_buffer (cf); + + cf->mode = READ; + + reader = xmalloc (sizeof *reader); + reader->cf = cf; + reader->next = cf->readers; + if (cf->readers != NULL) + reader->next->prev = reader; + reader->prev = NULL; + cf->readers = reader; + reader->case_idx = 0; + reader->cur = NULL; + reader->fd = -1; + reader->buffer = NULL; + reader->buffer_pos = 0; + + if (reader->cf->storage == MEMORY) + reader->cur = cf->head; + else + reader_open_file (reader); + + return reader; +} + +/* Opens a disk file for READER and seeks to the current position as indicated + by case_idx. Normally the current position is the beginning of the file, + but casefile_to_disk may cause the file to be opened at a different + position. */ +static void +reader_open_file (struct casereader *reader) +{ + size_t buffer_case_cnt; + off_t file_ofs; + + if (reader->case_idx >= reader->cf->case_cnt) + return; + + if (reader->cf->fd != -1) + { + reader->fd = reader->cf->fd; + reader->cf->fd = -1; + } + else + { + reader->fd = safe_open (reader->cf->filename, O_RDONLY); + if (reader->fd < 0) + msg (FE, _("%s: Opening temporary file: %s."), + reader->cf->filename, strerror (errno)); + } + + if (reader->cf->buffer != NULL) + { + reader->buffer = reader->cf->buffer; + reader->cf->buffer = NULL; + } + else + { + reader->buffer = xmalloc (reader->cf->buffer_size); + memset (reader->buffer, 0, reader->cf->buffer_size); + } + + if (reader->cf->case_size != 0) + { + buffer_case_cnt = reader->cf->buffer_size / reader->cf->case_size; + file_ofs = ((off_t) reader->case_idx + / buffer_case_cnt * reader->cf->buffer_size); + reader->buffer_pos = (reader->case_idx % buffer_case_cnt + * reader->cf->case_size); + } + else + file_ofs = 0; +#if HAVE_POSIX_FADVISE + posix_fadvise (reader->fd, file_ofs, 0, POSIX_FADV_SEQUENTIAL); +#endif + if (lseek (reader->fd, file_ofs, SEEK_SET) != file_ofs) + msg (FE, _("%s: Seeking temporary file: %s."), + reader->cf->filename, strerror (errno)); + + if (reader->cf->case_cnt > 0 && reader->cf->case_size > 0) + fill_buffer (reader); +} + +/* Fills READER's buffer by reading a block from disk. */ +static void +fill_buffer (struct casereader *reader) +{ + int retval = full_read (reader->fd, reader->buffer, reader->cf->buffer_size); + if (retval < 0) + msg (FE, _("%s: Reading temporary file: %s."), + reader->cf->filename, strerror (errno)); + else if (retval != reader->cf->buffer_size) + msg (FE, _("%s: Temporary file ended unexpectedly."), + reader->cf->filename); +} + +int +casereader_read (struct casereader *reader, const struct ccase **c) +{ + assert (reader != NULL); + + if (reader->case_idx >= reader->cf->case_cnt) + { + *c = NULL; + return 0; + } + + reader->case_idx++; + if (reader->cf->storage == MEMORY) + { + assert (reader->cur != NULL); + *c = &reader->cur->c; + reader->cur = reader->cur->next; + return 1; + } + else + { + if (reader->buffer_pos + reader->cf->case_size > reader->cf->buffer_size) + { + fill_buffer (reader); + reader->buffer_pos = 0; + } + + *c = (struct ccase *) (reader->buffer + reader->buffer_pos); + reader->buffer_pos += reader->cf->case_size; + return 1; + } +} + +void +casereader_destroy (struct casereader *reader) +{ + assert (reader != NULL); + + if (reader->next != NULL) + reader->next->prev = reader->prev; + if (reader->prev != NULL) + reader->prev->next = reader->next; + if (reader->cf->readers == reader) + reader->cf->readers = reader->next; + + if (reader->cf->buffer == NULL) + reader->cf->buffer = reader->buffer; + else + free (reader->buffer); + + if (reader->cf->fd == -1) + reader->cf->fd = reader->fd; + else + safe_close (reader->fd); + + free (reader); +} + +static int +safe_open (const char *filename, int flags) +{ + int fd; + + do + { + fd = open (filename, flags); + } + while (fd == -1 && errno == EINTR); + + return fd; +} + +static int safe_close (int fd) +{ + int retval; + + do + { + retval = close (fd); + } + while (retval == -1 && errno == EINTR); + + return retval; +} + +static int +full_read (int fd, char *buffer, size_t size) +{ + size_t bytes_read = 0; + + while (bytes_read < size) + { + int retval = read (fd, buffer + bytes_read, size - bytes_read); + if (retval > 0) + bytes_read += retval; + else if (retval == 0) + return bytes_read; + else if (errno != EINTR) + return -1; + } + + return bytes_read; +} + +static int +full_write (int fd, const char *buffer, size_t size) +{ + size_t bytes_written = 0; + + while (bytes_written < size) + { + int retval = write (fd, buffer + bytes_written, size - bytes_written); + if (retval >= 0) + bytes_written += retval; + else if (errno != EINTR) + return -1; + } + + return bytes_written; +} + +static void +register_atexit (void) +{ + static int registered = 0; + if (!registered) + { + registered = 1; + atexit (exit_handler); + } +} + +static void +exit_handler (void) +{ + while (casefiles != NULL) + casefile_destroy (casefiles); +} + +#include +#include "command.h" +#include "random.h" +#include "lexer.h" + +static void test_casefile (int pattern, size_t case_size, size_t case_cnt); +static struct ccase *get_random_case (size_t case_size, size_t case_idx); +static void write_random_case (struct casefile *cf, size_t case_idx); +static void read_and_verify_random_case (struct casefile *cf, + struct casereader *reader, + size_t case_idx); +static void fail_test (const char *message, ...); + +int +cmd_debug_casefile (void) +{ + static const size_t sizes[] = + { + 1, 2, 3, 4, 5, 6, 7, 14, 15, 16, 17, 31, 55, 73, + 100, 137, 257, 521, 1031, 2053 + }; + int size_max; + int case_max; + int pattern; + + size_max = sizeof sizes / sizeof *sizes; + if (lex_match_id ("SMALL")) + { + size_max -= 4; + case_max = 511; + } + else + case_max = 4095; + if (token != '.') + return lex_end_of_command (); + + for (pattern = 0; pattern < 5; pattern++) + { + const size_t *size; + + for (size = sizes; size < sizes + size_max; size++) + { + size_t case_cnt; + + for (case_cnt = 0; case_cnt <= case_max; + case_cnt = (case_cnt * 2) + 1) + test_casefile (pattern, *size * sizeof (union value), case_cnt); + } + } + printf ("Casefile tests succeeded.\n"); + return CMD_SUCCESS; +} + +static void +test_casefile (int pattern, size_t case_size, size_t case_cnt) +{ + int zero = 0; + struct casefile *cf; + struct casereader *r1, *r2; + const struct ccase *c; + struct rng *rng; + size_t i, j; + + rng = rng_create (); + rng_seed (rng, &zero, sizeof zero); + cf = casefile_create (case_size); + for (i = 0; i < case_cnt; i++) + write_random_case (cf, i); + r1 = casefile_get_reader (cf); + r2 = casefile_get_reader (cf); + switch (pattern) + { + case 0: + for (i = 0; i < case_cnt; i++) + { + read_and_verify_random_case (cf, r1, i); + read_and_verify_random_case (cf, r2, i); + } + break; + case 1: + for (i = 0; i < case_cnt; i++) + read_and_verify_random_case (cf, r1, i); + for (i = 0; i < case_cnt; i++) + read_and_verify_random_case (cf, r2, i); + break; + case 2: + case 3: + case 4: + for (i = j = 0; i < case_cnt; i++) + { + read_and_verify_random_case (cf, r1, i); + if (rng_get_int (rng) % pattern == 0) + read_and_verify_random_case (cf, r2, j++); + if (i == case_cnt / 2) + casefile_to_disk (cf); + } + for (; j < case_cnt; j++) + read_and_verify_random_case (cf, r2, j); + break; + } + if (casereader_read (r1, &c)) + fail_test ("Casereader 1 not at end of file."); + if (casereader_read (r2, &c)) + fail_test ("Casereader 2 not at end of file."); + if (pattern != 1) + casereader_destroy (r1); + if (pattern != 2) + casereader_destroy (r2); + casefile_destroy (cf); + rng_destroy (rng); +} + +static struct ccase * +get_random_case (size_t case_size, size_t case_idx) +{ + struct ccase *c = xmalloc (case_size); + memset (c, case_idx % 257, case_size); + return c; +} + +static void +write_random_case (struct casefile *cf, size_t case_idx) +{ + struct ccase *c = get_random_case (casefile_get_case_size (cf), case_idx); + casefile_append (cf, c); + free (c); +} + +static void +read_and_verify_random_case (struct casefile *cf, + struct casereader *reader, size_t case_idx) +{ + const struct ccase *read_case; + struct ccase *expected_case; + size_t case_size; + + case_size = casefile_get_case_size (cf); + expected_case = get_random_case (case_size, case_idx); + if (!casereader_read (reader, &read_case)) + fail_test ("Premature end of casefile."); + if (memcmp (read_case, expected_case, case_size)) + fail_test ("Case %lu fails comparison.", (unsigned long) case_idx); + free (expected_case); +} + +static void +fail_test (const char *message, ...) +{ + va_list args; + + va_start (args, message); + vprintf (message, args); + putchar ('\n'); + va_end (args); + + exit (1); +} diff --git a/src/casefile.h b/src/casefile.h new file mode 100644 index 0000000000..65ab4915fe --- /dev/null +++ b/src/casefile.h @@ -0,0 +1,48 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2004 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#ifndef HEADER_CASEFILE +#define HEADER_CASEFILE + +#include + +struct ccase; +struct casefile; +struct casereader; + +struct casefile *casefile_create (size_t case_size); +void casefile_destroy (struct casefile *); + +int casefile_in_core (const struct casefile *); +size_t casefile_get_case_size (const struct casefile *); +unsigned long casefile_get_case_cnt (const struct casefile *); + +void casefile_append (struct casefile *, const struct ccase *); +void casefile_to_disk (struct casefile *); + +int casefile_sort (struct casefile *, + int (*compare) (const struct ccase *, + const struct ccase *, void *aux), + void *aux); + +struct casereader *casefile_get_reader (const struct casefile *); +int casereader_read (struct casereader *, const struct ccase **); +void casereader_destroy (struct casereader *); + +#endif /* casefile.h */ diff --git a/src/command.c b/src/command.c index 071c7e4e9e..dff2ff4732 100644 --- a/src/command.c +++ b/src/command.c @@ -293,6 +293,12 @@ conflicting_3char_prefixes (const char *a, const char *b) bw = find_word (b, &bw_len); assert (aw != NULL && bw != NULL); + /* Words that are the same don't conflict. */ + if (aw_len == bw_len && !memcmp (aw, bw, aw_len)) + return 0; + + /* Words that are otherwise the same in the first three letters + do conflict. */ return ((aw_len > 3 && bw_len > 3) || (aw_len == 3 && bw_len > 3) || (bw_len == 3 && aw_len > 3)) && !memcmp (aw, bw, 3); diff --git a/src/command.def b/src/command.def index 09f809cd4c..cfb74112b0 100644 --- a/src/command.def +++ b/src/command.def @@ -39,7 +39,9 @@ DEFCMD ("CONDESCRIPTIVES", ERRO, ERRO, PROC, PROC, cmd_descriptives) DEFCMD ("COUNT", ERRO, INPU, TRAN, TRAN, cmd_count) DEFCMD ("CROSSTABS", ERRO, ERRO, PROC, PROC, cmd_crosstabs) DEFCMD ("DATA LIST", TRAN, INPU, TRAN, TRAN, cmd_data_list) +DEFCMD ("DEBUG CASEFILE", INIT, INPU, TRAN, PROC, cmd_debug_casefile) DEFCMD ("DEBUG EVALUATE", INIT, INPU, TRAN, PROC, cmd_debug_evaluate) +DEFCMD ("DEBUG MOMENTS", INIT, INPU, TRAN, PROC, cmd_debug_moments) DEFCMD ("DESCRIPTIVES", ERRO, ERRO, PROC, PROC, cmd_descriptives) DEFCMD ("DISPLAY", ERRO, INPU, TRAN, PROC, cmd_display) DEFCMD ("DO IF", ERRO, INPU, TRAN, TRAN, cmd_do_if) diff --git a/src/crosstabs.q b/src/crosstabs.q index c3a17aafac..1f66f066b3 100644 --- a/src/crosstabs.q +++ b/src/crosstabs.q @@ -34,6 +34,7 @@ #include #include #include +#include #include "algorithm.h" #include "alloc.h" #include "hash.h" @@ -43,7 +44,6 @@ #include "error.h" #include "magic.h" #include "misc.h" -#include "stats.h" #include "output.h" #include "tab.h" #include "value-labels.h" @@ -2030,7 +2030,7 @@ display_chisq (void) tab_float (chisq, 1, 0, TAB_RIGHT, chisq_v[i], 8, 3); tab_float (chisq, 2, 0, TAB_RIGHT, df[i], 8, 0); tab_float (chisq, 3, 0, TAB_RIGHT, - chisq_sig (chisq_v[i], df[i]), 8, 3); + gsl_cdf_chisq_Q (chisq_v[i], df[i]), 8, 3); } else { @@ -2695,10 +2695,10 @@ calc_symmetric (double v[N_SYMMETRIC], double ase[N_SYMMETRIC], if (cmd.a_statistics[CRS_ST_D]) { - d_yx_cum += fij * sqr (Dr * (Cij - Dij) - - (P - Q) * (W - row_tot[i])); - d_xy_cum += fij * sqr (Dc * (Dij - Cij) - - (Q - P) * (W - col_tot[j])); + d_yx_cum += fij * pow2 (Dr * (Cij - Dij) + - (P - Q) * (W - row_tot[i])); + d_xy_cum += fij * pow2 (Dc * (Dij - Cij) + - (Q - P) * (W - col_tot[j])); } if (++j == n_cols) @@ -2718,8 +2718,8 @@ calc_symmetric (double v[N_SYMMETRIC], double ase[N_SYMMETRIC], } btau_var = ((btau_cum - - (W * sqr (W * (P - Q) / sqrt (Dr * Dc) * (Dr + Dc)))) - / sqr (Dr * Dc)); + - (W * pow2 (W * (P - Q) / sqrt (Dr * Dc) * (Dr + Dc)))) + / pow2 (Dr * Dc)); if (cmd.a_statistics[CRS_ST_BTAU]) { ase[3] = sqrt (btau_var); @@ -2744,17 +2744,17 @@ calc_symmetric (double v[N_SYMMETRIC], double ase[N_SYMMETRIC], somers_d_ase[0] = 2. * btau_var / (Dr + Dc) * sqrt (Dr * Dc); somers_d_t[0] = (somers_d_v[0] / (4 / (Dc + Dr) - * sqrt (ctau_cum - sqr (P - Q) / W))); + * sqrt (ctau_cum - pow2 (P - Q) / W))); somers_d_v[1] = (P - Q) / Dc; - somers_d_ase[1] = 2. / sqr (Dc) * sqrt (d_xy_cum); + somers_d_ase[1] = 2. / pow2 (Dc) * sqrt (d_xy_cum); somers_d_t[1] = (somers_d_v[1] / (2. / Dc - * sqrt (ctau_cum - sqr (P - Q) / W))); + * sqrt (ctau_cum - pow2 (P - Q) / W))); somers_d_v[2] = (P - Q) / Dr; - somers_d_ase[2] = 2. / sqr (Dr) * sqrt (d_yx_cum); + somers_d_ase[2] = 2. / pow2 (Dr) * sqrt (d_yx_cum); somers_d_t[2] = (somers_d_v[2] / (2. / Dr - * sqrt (ctau_cum - sqr (P - Q) / W))); + * sqrt (ctau_cum - pow2 (P - Q) / W))); } free (cum); @@ -2847,12 +2847,12 @@ calc_symmetric (double v[N_SYMMETRIC], double ase[N_SYMMETRIC], / (W * (W * W - sum_rici) * (W * W - sum_rici))); #if 0 t[8] = v[8] / sqrt (W * (((sum_fii * (W - sum_fii)) - / sqr (W * W - sum_rici)) + / pow2 (W * W - sum_rici)) + ((2. * (W - sum_fii) * (2. * sum_fii * sum_rici - W * sum_fiiri_ci)) / cube (W * W - sum_rici)) - + (sqr (W - sum_fii) + + (pow2 (W - sum_fii) * (W * sum_fijri_ci2 - 4. * sum_rici * sum_rici) / pow4 (W * W - sum_rici)))); @@ -3015,7 +3015,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], { const int deltaj = j == cm_index; accum += (mat[j + i * n_cols] - * sqr ((j == fim_index[i]) + * pow2 ((j == fim_index[i]) - deltaj + v[0] * deltaj)); } @@ -3031,7 +3031,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], if (cm_index != fim_index[i]) accum += (mat[i * n_cols + fim_index[i]] + mat[i * n_cols + cm_index]); - t[2] = v[2] / (sqrt (accum - sqr (sum_fim - cm) / W) / (W - cm)); + t[2] = v[2] / (sqrt (accum - pow2 (sum_fim - cm) / W) / (W - cm)); } /* ASE1 for X given Y. */ @@ -3043,7 +3043,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], { const int deltaj = i == rm_index; accum += (mat[j + i * n_cols] - * sqr ((i == fmj_index[j]) + * pow2 ((i == fmj_index[j]) - deltaj + v[0] * deltaj)); } @@ -3059,7 +3059,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], if (rm_index != fmj_index[j]) accum += (mat[j + n_cols * fmj_index[j]] + mat[j + n_cols * rm_index]); - t[1] = v[1] / (sqrt (accum - sqr (sum_fmj - rm) / W) / (W - rm)); + t[1] = v[1] / (sqrt (accum - pow2 (sum_fmj - rm) / W) / (W - rm)); } /* Symmetric ASE0 and ASE1. */ @@ -3072,12 +3072,12 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], { int temp0 = (fmj_index[j] == i) + (fim_index[i] == j); int temp1 = (i == rm_index) + (j == cm_index); - accum0 += mat[j + i * n_cols] * sqr (temp0 - temp1); + accum0 += mat[j + i * n_cols] * pow2 (temp0 - temp1); accum1 += (mat[j + i * n_cols] - * sqr (temp0 + (v[0] - 1.) * temp1)); + * pow2 (temp0 + (v[0] - 1.) * temp1)); } ase[0] = sqrt (accum1 - 4. * W * v[0] * v[0]) / (2. * W - rm - cm); - t[0] = v[0] / (sqrt (accum0 - sqr ((sum_fim + sum_fmj - cm - rm) / W)) + t[0] = v[0] / (sqrt (accum0 - pow2 ((sum_fim + sum_fmj - cm - rm) / W)) / (2. * W - rm - cm)); } @@ -3093,7 +3093,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], for (sum_fij2_ri = sum_fij2_ci = 0., i = 0; i < n_rows; i++) for (j = 0; j < n_cols; j++) { - double temp = sqr (mat[j + i * n_cols]); + double temp = pow2 (mat[j + i * n_cols]); sum_fij2_ri += temp / row_tot[i]; sum_fij2_ci += temp / col_tot[j]; } @@ -3131,7 +3131,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], if (entry <= 0.) continue; - P += entry * sqr (log (col_tot[j] * row_tot[i] / (W * entry))); + P += entry * pow2 (log (col_tot[j] * row_tot[i] / (W * entry))); UXY -= entry / W * log (entry / W); } @@ -3143,27 +3143,27 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], if (entry <= 0.) continue; - ase1_yx += entry * sqr (UY * log (entry / row_tot[i]) + ase1_yx += entry * pow2 (UY * log (entry / row_tot[i]) + (UX - UXY) * log (col_tot[j] / W)); - ase1_xy += entry * sqr (UX * log (entry / col_tot[j]) + ase1_xy += entry * pow2 (UX * log (entry / col_tot[j]) + (UY - UXY) * log (row_tot[i] / W)); - ase1_sym += entry * sqr ((UXY + ase1_sym += entry * pow2 ((UXY * log (row_tot[i] * col_tot[j] / (W * W))) - (UX + UY) * log (entry / W)); } v[5] = 2. * ((UX + UY - UXY) / (UX + UY)); - ase[5] = (2. / (W * sqr (UX + UY))) * sqrt (ase1_sym); + ase[5] = (2. / (W * pow2 (UX + UY))) * sqrt (ase1_sym); t[5] = v[5] / ((2. / (W * (UX + UY))) - * sqrt (P - sqr (UX + UY - UXY) / W)); + * sqrt (P - pow2 (UX + UY - UXY) / W)); v[6] = (UX + UY - UXY) / UX; ase[6] = sqrt (ase1_xy) / (W * UX * UX); - t[6] = v[6] / (sqrt (P - W * sqr (UX + UY - UXY)) / (W * UX)); + t[6] = v[6] / (sqrt (P - W * pow2 (UX + UY - UXY)) / (W * UX)); v[7] = (UX + UY - UXY) / UY; ase[7] = sqrt (ase1_yx) / (W * UY * UY); - t[7] = v[7] / (sqrt (P - W * sqr (UX + UY - UXY)) / (W * UY)); + t[7] = v[7] / (sqrt (P - W * pow2 (UX + UY - UXY)) / (W * UY)); } /* Somers' D. */ diff --git a/src/debug.c b/src/debug.c index b1603bf35e..5392c68c64 100644 --- a/src/debug.c +++ b/src/debug.c @@ -19,77 +19,14 @@ #include #include +#include #include +#include +#include "alloc.h" #include "command.h" #include "error.h" #include "expr.h" #include "lexer.h" +#include "moments.h" #include "var.h" -int -cmd_debug_evaluate (void) -{ - struct expression *expr; - union value value; - enum expr_type expr_flags; - int dump_postfix = 0; - - discard_variables (); - - expr_flags = 0; - if (lex_match_id ("NOOPTIMIZE")) - expr_flags |= EXPR_NO_OPTIMIZE; - if (lex_match_id ("POSTFIX")) - dump_postfix = 1; - if (token != '/') - { - lex_force_match ('/'); - return CMD_FAILURE; - } - fprintf (stderr, "%s => ", lex_rest_of_line (NULL)); - lex_get (); - - expr = expr_parse (EXPR_ANY | expr_flags); - if (!expr || token != '.') - { - fprintf (stderr, "error\n"); - return CMD_FAILURE; - } - - if (dump_postfix) - expr_debug_print_postfix (expr); - else - { - expr_evaluate (expr, NULL, 0, &value); - switch (expr_get_type (expr)) - { - case EXPR_NUMERIC: - if (value.f == SYSMIS) - fprintf (stderr, "sysmis\n"); - else - fprintf (stderr, "%.2f\n", value.f); - break; - - case EXPR_BOOLEAN: - if (value.f == SYSMIS) - fprintf (stderr, "sysmis\n"); - else if (value.f == 0.0) - fprintf (stderr, "false\n"); - else - fprintf (stderr, "true\n"); - break; - - case EXPR_STRING: - fputc ('"', stderr); - fwrite (value.c + 1, value.c[0], 1, stderr); - fputs ("\"\n", stderr); - break; - - default: - assert (0); - } - } - - expr_free (expr); - return CMD_SUCCESS; -} diff --git a/src/descript.q b/src/descript.q deleted file mode 100644 index 9ba2b569c7..0000000000 --- a/src/descript.q +++ /dev/null @@ -1,860 +0,0 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. - Written by Ben Pfaff . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. */ - -/* FIXME: Many possible optimizations. */ - -#include -#include "error.h" -#include -#include -#include -#include "algorithm.h" -#include "alloc.h" -#include "bitvector.h" -#include "command.h" -#include "lexer.h" -#include "error.h" -#include "magic.h" -#include "stats.h" -#include "som.h" -#include "tab.h" -#include "var.h" -#include "vfm.h" - -/* (specification) - DESCRIPTIVES (dsc_): - *variables=custom; - +missing=miss:!variable/listwise,incl:!noinclude/include; - +format=labeled:!labels/nolabels,indexed:!noindex/index,lined:!line/serial; - +save=; - +options[op_]=1,2,3,4,5,6,7,8; - +statistics[st_]=all,1|mean,2|semean,5|stddev,6|variance,7|kurtosis, - 8|skewness,9|range,10|minimum,11|maximum,12|sum, - 13|default,14|seskewness,15|sekurtosis; - +sort=sortby:mean/semean/stddev/variance/kurtosis/skewness/range/ - range/minimum/maximum/sum/name/seskewness/sekurtosis/!none, - order:!a/d. -*/ -/* (declarations) */ -/* (functions) */ - -/* DESCRIPTIVES private data. */ - -/* Describes properties of a distribution for the purpose of - calculating a Z-score. */ -struct dsc_z_score - { - struct variable *s, *d; /* Source, destination variable. */ - double mean; /* Distribution mean. */ - double std_dev; /* Distribution standard deviation. */ - }; - -/* DESCRIPTIVES transformation (for calculating Z-scores). */ -struct descriptives_trns - { - struct trns_header h; - int n; /* Number of Z-scores. */ - struct dsc_z_score *z; /* Array of Z-scores. */ - }; - -/* These next three vars, see comment at top of display(). */ -/* Number of cases missing listwise, even if option 5 not selected. */ -static double d_glob_miss_list; - -/* Number of total *cases* valid or missing, as a double. Unless - option 5 is selected, d_glob_missing is 0. */ -static double d_glob_valid, d_glob_missing; - -/* Set when a weighting variable is missing or <=0. */ -static int bad_weight; - -/* Number of generic zvarnames we've generated in this execution. */ -static int z_count; - -/* Variables specified on command. */ -static struct variable **v_variables; -static int n_variables; - -/* Command specifications. */ -static struct cmd_descriptives cmd; - -/* Whether z-scores are computed. */ -static int z_scores; - -/* Statistic to sort by. */ -static int sortby_stat; - -/* Statistics to display. */ -static unsigned long stats; - -/* Easier access to long-named arrays. */ -#define stat cmd.a_statistics -#define opt cmd.a_options - -/* Groups of statistics. */ -#define BI BIT_INDEX - -#define dsc_default \ - (BI (dsc_mean) | BI (dsc_stddev) | BI (dsc_min) | BI (dsc_max)) - -#define dsc_all \ - (BI (dsc_sum) | BI (dsc_min) | BI (dsc_max) \ - | BI (dsc_mean) | BI (dsc_semean) | BI (dsc_stddev) \ - | BI (dsc_variance) | BI (dsc_kurt) | BI (dsc_sekurt) \ - | BI (dsc_skew) | BI (dsc_seskew) | BI (dsc_range) \ - | BI (dsc_range)) - -/* Table of options. */ -#define op_incl_miss DSC_OP_1 /* Honored. */ -#define op_no_varlabs DSC_OP_2 /* Ignored. */ -#define op_zscores DSC_OP_3 /* Honored. */ -#define op_index DSC_OP_4 /* FIXME. */ -#define op_excl_miss DSC_OP_5 /* Honored. */ -#define op_serial DSC_OP_6 /* Honored. */ -#define op_narrow DSC_OP_7 /* Ignored. */ -#define op_no_varnames DSC_OP_8 /* Honored. */ - -/* Describes one statistic that can be calculated. */ -/* FIXME: Currently sm,col_width are not used. */ -struct dsc_info - { - int st_indx; /* Index into st_a_statistics[]. */ - int sb_indx; /* Sort-by index. */ - const char *s10; /* Name, stuffed into 10 columns. */ - const char *s8; /* Name, stuffed into 8 columns. */ - const char *sm; /* Name, stuffed minimally. */ - const char *s; /* Full name. */ - int max_degree; /* Highest degree necessary to calculate this - statistic. */ - int col_width; /* Column width (not incl. spacing between columns) */ - }; - -/* Table of statistics, indexed by dsc_*. */ -static struct dsc_info dsc_info[dsc_n_stats] = -{ - {DSC_ST_MEAN, DSC_MEAN, N_("Mean"), N_("Mean"), N_("Mean"), N_("mean"), 1, 10}, - {DSC_ST_SEMEAN, DSC_SEMEAN, N_("S.E. Mean"), N_("S E Mean"), N_("SE"), - N_("standard error of mean"), 2, 10}, - {DSC_ST_STDDEV, DSC_STDDEV, N_("Std Dev"), N_("Std Dev"), N_("SD"), - N_("standard deviation"), 2, 11}, - {DSC_ST_VARIANCE, DSC_VARIANCE, N_("Variance"), N_("Variance"), - N_("Var"), N_("variance"), 2, 12}, - {DSC_ST_KURTOSIS, DSC_KURTOSIS, N_("Kurtosis"), N_("Kurtosis"), - N_("Kurt"), N_("kurtosis"), 4, 9}, - {DSC_ST_SEKURTOSIS, DSC_SEKURTOSIS, N_("S.E. Kurt"), N_("S E Kurt"), N_("SEKurt"), - N_("standard error of kurtosis"), 0, 9}, - {DSC_ST_SKEWNESS, DSC_SKEWNESS, N_("Skewness"), N_("Skewness"), N_("Skew"), - N_("skewness"), 3, 9}, - {DSC_ST_SESKEWNESS, DSC_SESKEWNESS, N_("S.E. Skew"), N_("S E Skew"), N_("SESkew"), - N_("standard error of skewness"), 0, 9}, - {DSC_ST_RANGE, DSC_RANGE, N_("Range"), N_("Range"), N_("Rng"), N_("range"), 0, 10}, - {DSC_ST_MINIMUM, DSC_MINIMUM, N_("Minimum"), N_("Minimum"), N_("Min"), - N_("minimum"), 0, 10}, - {DSC_ST_MAXIMUM, DSC_MAXIMUM, N_("Maximum"), N_("Maximum"), N_("Max"), - N_("maximum"), 0, 10}, - {DSC_ST_SUM, DSC_SUM, N_("Sum"), N_("Sum"), N_("Sum"), N_("sum"), 1, 13}, -}; - -/* Z-score functions. */ -static int generate_z_varname (struct variable * v); -static void dump_z_table (void); -static void run_z_pass (void); - -/* Procedure execution functions. */ -static int calc (struct ccase *, void *); -static void precalc (void *); -static void postcalc (void *); -static void display (void); - -/* Parser and outline. */ - -int -cmd_descriptives (void) -{ - struct variable *v; - int i; - - v_variables = NULL; - n_variables = 0; - - if (!parse_descriptives (&cmd)) - goto lossage; - - if (n_variables == 0) - goto lossage; - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - v->p.dsc.dup = 0; - v->p.dsc.zname[0] = 0; - } - - if (n_variables < 0) - { - msg (SE, _("No variables specified.")); - goto lossage; - } - - if (cmd.sbc_options && (cmd.sbc_save || cmd.sbc_format || cmd.sbc_missing)) - { - msg (SE, _("OPTIONS may not be used with SAVE, FORMAT, or MISSING.")); - goto lossage; - } - - if (!cmd.sbc_options) - { - if (cmd.incl == DSC_INCLUDE) - opt[op_incl_miss] = 1; - if (cmd.labeled == DSC_NOLABELS) - opt[op_no_varlabs] = 1; - if (cmd.sbc_save) - opt[op_zscores] = 1; - if (cmd.miss == DSC_LISTWISE) - opt[op_excl_miss] = 1; - if (cmd.lined == DSC_SERIAL) - opt[op_serial] = 1; - } - - /* Construct z-score varnames, show translation table. */ - if (opt[op_zscores]) - { - z_count = 0; - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - if (v->p.dsc.dup++) - continue; - - if (v->p.dsc.zname[0] == 0) - if (!generate_z_varname (v)) - goto lossage; - } - dump_z_table (); - z_scores = 1; - } - - /* Figure out statistics to calculate. */ - stats = 0; - if (stat[DSC_ST_DEFAULT] || !cmd.sbc_statistics) - stats |= dsc_default; - if (stat[DSC_ST_ALL]) - stats |= dsc_all; - for (i = 0; i < dsc_n_stats; i++) - if (stat[dsc_info[i].st_indx]) - stats |= BIT_INDEX (i); - if (stats & dsc_kurt) - stats |= dsc_sekurt; - if (stats & dsc_skew) - stats |= dsc_seskew; - - /* Check the sort order. */ - sortby_stat = -1; - if (cmd.sortby == DSC_NONE) - sortby_stat = -2; - else if (cmd.sortby != DSC_NAME) - { - for (i = 0; i < n_variables; i++) - if (dsc_info[i].sb_indx == cmd.sortby) - { - sortby_stat = i; - if (!(stats & BIT_INDEX (i))) - { - msg (SE, _("It's not possible to sort on `%s' without " - "displaying `%s'."), - gettext (dsc_info[i].s), gettext (dsc_info[i].s)); - goto lossage; - } - } - assert (sortby_stat != -1); - } - - /* Data pass! */ - bad_weight = 0; - procedure_with_splits (precalc, calc, postcalc, NULL); - - if (bad_weight) - msg (SW, _("At least one case in the data file had a weight value " - "that was system-missing, zero, or negative. These case(s) " - "were ignored.")); - - /* Z-scoring! */ - if (z_scores) - run_z_pass (); - - if (v_variables) - free (v_variables); - return CMD_SUCCESS; - - lossage: - if (v_variables) - free (v_variables); - return CMD_FAILURE; -} - -/* Parses the VARIABLES subcommand. */ -static int -dsc_custom_variables (struct cmd_descriptives *cmd UNUSED) -{ - if (!lex_match_id ("VARIABLES") - && (token != T_ID || dict_lookup_var (default_dict, tokid) == NULL) - && token != T_ALL) - return 2; - lex_match ('='); - - while (token == T_ID || token == T_ALL) - { - int i, n; - - n = n_variables; - if (!parse_variables (default_dict, &v_variables, &n_variables, - PV_DUPLICATE | PV_SINGLE | PV_APPEND | PV_NUMERIC - | PV_NO_SCRATCH)) - return 0; - if (lex_match ('(')) - { - if (n_variables - n > 1) - { - msg (SE, _("Names for z-score variables must be given for " - "individual variables, not for groups of " - "variables.")); - return 0; - } - assert (n_variables - n <= 0); - if (token != T_ID) - { - msg (SE, _("Name for z-score variable expected.")); - return 0; - } - if (dict_lookup_var (default_dict, tokid)) - { - msg (SE, _("Z-score variable name `%s' is a " - "duplicate variable name with a current variable."), - tokid); - return 0; - } - for (i = 0; i < n_variables; i++) - if (v_variables[i]->p.dsc.zname[0] - && !strcmp (v_variables[i]->p.dsc.zname, tokid)) - { - msg (SE, _("Z-score variable name `%s' is " - "used multiple times."), tokid); - return 0; - } - strcpy (v_variables[n_variables - 1]->p.dsc.zname, tokid); - lex_get (); - if (token != ')') - { - msg (SE, _("`)' expected after z-score variable name.")); - return 0; - } - - z_scores = 1; - } - lex_match (','); - } - return 1; -} - -/* Z scores. */ - -/* Returns 0 if NAME is a duplicate of any existing variable name or - of any previously-declared z-var name; otherwise returns 1. */ -static int -try_name (char *name) -{ - int i; - - if (dict_lookup_var (default_dict, name) != NULL) - return 0; - for (i = 0; i < n_variables; i++) - { - struct variable *v = v_variables[i]; - if (!strcmp (v->p.dsc.zname, name)) - return 0; - } - return 1; -} - -static int -generate_z_varname (struct variable * v) -{ - char zname[10]; - - strcpy (&zname[1], v->name); - zname[0] = 'Z'; - zname[8] = '\0'; - if (try_name (zname)) - { - strcpy (v->p.dsc.zname, zname); - return 1; - } - - for (;;) - { - /* Generate variable name. */ - z_count++; - - if (z_count <= 99) - sprintf (zname, "ZSC%03d", z_count); - else if (z_count <= 108) - sprintf (zname, "STDZ%02d", z_count - 99); - else if (z_count <= 117) - sprintf (zname, "ZZZZ%02d", z_count - 108); - else if (z_count <= 126) - sprintf (zname, "ZQZQ%02d", z_count - 117); - else - { - msg (SE, _("Ran out of generic names for Z-score variables. " - "There are only 126 generic names: ZSC001-ZSC0999, " - "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09.")); - return 0; - } - - if (try_name (zname)) - { - strcpy (v->p.dsc.zname, zname); - return 1; - } - } -} - -static void -dump_z_table (void) -{ - int count; - struct tab_table *t; - - { - int i; - - for (count = i = 0; i < n_variables; i++) - if (v_variables[i]->p.dsc.zname) - count++; - } - - t = tab_create (2, count + 1, 0); - tab_title (t, 0, _("Mapping of variables to corresponding Z-scores.")); - tab_columns (t, SOM_COL_DOWN, 1); - tab_headers (t, 0, 0, 1, 0); - tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, count); - tab_hline (t, TAL_2, 0, 1, 1); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Source")); - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Target")); - tab_dim (t, tab_natural_dimensions); - - { - int i, y; - - for (i = 0, y = 1; i < n_variables; i++) - if (v_variables[i]->p.dsc.zname) - { - tab_text (t, 0, y, TAB_LEFT, v_variables[i]->name); - tab_text (t, 1, y++, TAB_LEFT, v_variables[i]->p.dsc.zname); - } - } - - tab_submit (t); -} - -/* Transformation function to calculate Z-scores. */ -static int -descriptives_trns_proc (struct trns_header * trns, struct ccase * c, - int case_num UNUSED) -{ - struct descriptives_trns *t = (struct descriptives_trns *) trns; - struct dsc_z_score *z; - int i; - - for (i = t->n, z = t->z; i--; z++) - { - double score = c->data[z->s->fv].f; - - if (z->mean == SYSMIS || score == SYSMIS) - c->data[z->d->fv].f = SYSMIS; - else - c->data[z->d->fv].f = (score - z->mean) / z->std_dev; - } - return -1; -} - -/* Frees a descriptives_trns struct. */ -static void -descriptives_trns_free (struct trns_header * trns) -{ - struct descriptives_trns *t = (struct descriptives_trns *) trns; - - free (t->z); -} - -/* The name is a misnomer: actually this function sets up a - transformation by which scores can be converted into Z-scores. */ -static void -run_z_pass (void) -{ - struct descriptives_trns *t; - int count, i; - - for (i = 0; i < n_variables; i++) - v_variables[i]->p.dsc.dup = 0; - for (count = i = 0; i < n_variables; i++) - { - if (v_variables[i]->p.dsc.dup++) - continue; - if (v_variables[i]->p.dsc.zname) - count++; - } - - t = xmalloc (sizeof *t); - t->h.proc = descriptives_trns_proc; - t->h.free = descriptives_trns_free; - t->n = count; - t->z = xmalloc (count * sizeof *t->z); - - for (i = 0; i < n_variables; i++) - v_variables[i]->p.dsc.dup = 0; - for (count = i = 0; i < n_variables; i++) - { - struct variable *v = v_variables[i]; - if (v->p.dsc.dup++ == 0 && v->p.dsc.zname[0]) - { - char *cp; - struct variable *d; - - t->z[count].s = v; - t->z[count].d = d = dict_create_var_assert (default_dict, - v->p.dsc.zname, 0); - d->init = 0; - if (v->label) - { - d->label = xmalloc (strlen (v->label) + 12); - cp = stpcpy (d->label, _("Z-score of ")); - strcpy (cp, v->label); - } - else - { - d->label = xmalloc (strlen (v->name) + 12); - cp = stpcpy (d->label, _("Z-score of ")); - strcpy (cp, v->name); - } - t->z[count].mean = v->p.dsc.stats[dsc_mean]; - t->z[count].std_dev = v->p.dsc.stats[dsc_stddev]; - if (t->z[count].std_dev == SYSMIS || t->z[count].std_dev == 0.0) - t->z[count].mean = SYSMIS; - count++; - } - } - - add_transformation ((struct trns_header *) t); -} - -/* Statistical calculation. */ - -static void -precalc (void *aux UNUSED) -{ - int i; - - for (i = 0; i < n_variables; i++) - v_variables[i]->p.dsc.dup = -2; - for (i = 0; i < n_variables; i++) - { - struct descriptives_proc *dsc = &v_variables[i]->p.dsc; - - /* Don't need to initialize more than once. */ - if (dsc->dup == -1) - continue; - dsc->dup = -1; - - dsc->valid = dsc->miss = 0.0; - dsc->X_bar = dsc->M2 = dsc->M3 = dsc->M4 = 0.0; - dsc->min = DBL_MAX; - dsc->max = -DBL_MAX; - } - d_glob_valid = d_glob_missing = 0.0; -} - -static int -calc (struct ccase * c, void *aux UNUSED) -{ - int i; - - /* Unique case identifier. */ - static int case_id; - - /* Get the weight for this case. */ - double weight = dict_get_case_weight (default_dict, c); - - if (weight <= 0.0) - { - weight = 0.0; - bad_weight = 1; - } - case_id++; - - /* Handle missing values. */ - for (i = 0; i < n_variables; i++) - { - struct variable *v = v_variables[i]; - double X = c->data[v->fv].f; - - if (X == SYSMIS || (!opt[op_incl_miss] && is_num_user_missing (X, v))) - { - if (opt[op_excl_miss]) - { - d_glob_missing += weight; - return 1; - } - else - { - d_glob_miss_list += weight; - goto iterate; - } - } - } - d_glob_valid += weight; - -iterate: - for (i = 0; i < n_variables; i++) - { - struct descriptives_proc *inf = &v_variables[i]->p.dsc; - - double X, v; - double W_old, W_new; - double v2, v3, v4; - double w2, w3, w4; - - if (inf->dup == case_id) - continue; - inf->dup = case_id; - - X = c->data[v_variables[i]->fv].f; - if (X == SYSMIS - || (!opt[op_incl_miss] && is_num_user_missing (X, v_variables[i]))) - { - inf->miss += weight; - continue; - } - - /* These formulas taken from _SPSS Statistical Algorithms_. The - names W_old, and W_new are used for W_j-1 and W_j, - respectively, and other variables simply have the subscripts - trimmed off, except for X_bar. - - I am happy that mathematical formulas may not be - copyrighted. */ - W_old = inf->valid; - W_new = inf->valid += weight; - v = (weight / W_new) * (X - inf->X_bar); - v2 = v * v; - v3 = v2 * v; - v4 = v3 * v; - w2 = weight * weight; - w3 = w2 * weight; - w4 = w3 * weight; - inf->M4 += (-4.0 * v * inf->M3 + 6.0 * v2 * inf->M2 - + (W_new * W_new - 3 * weight * W_old / w3) * v4 * W_old * W_new); - inf->M3 += (-3.0 * v * inf->M2 + W_new * W_old / w2 - * (W_new - 2 * weight) * v3); - inf->M2 += W_new * W_old / weight * v2; - inf->X_bar += v; - if (X < inf->min) - inf->min = X; - if (X > inf->max) - inf->max = X; - } - return 1; -} - -static void -postcalc (void *aux UNUSED) -{ - int i; - - if (opt[op_excl_miss]) - d_glob_miss_list = d_glob_missing; - - for (i = 0; i < n_variables; i++) - { - struct descriptives_proc *dsc = &v_variables[i]->p.dsc; - double W; - - /* Don't duplicate our efforts. */ - if (dsc->dup == -2) - continue; - dsc->dup = -2; - - W = dsc->valid; - - dsc->stats[dsc_mean] = dsc->X_bar; - dsc->stats[dsc_variance] = dsc->M2 / (W - 1); - dsc->stats[dsc_stddev] = sqrt (dsc->stats[dsc_variance]); - dsc->stats[dsc_semean] = dsc->stats[dsc_stddev] / sqrt (W); - dsc->stats[dsc_min] = dsc->min == DBL_MAX ? SYSMIS : dsc->min; - dsc->stats[dsc_max] = dsc->max == -DBL_MAX ? SYSMIS : dsc->max; - dsc->stats[dsc_range] = ((dsc->min == DBL_MAX || dsc->max == -DBL_MAX) - ? SYSMIS : dsc->max - dsc->min); - dsc->stats[dsc_sum] = W * dsc->X_bar; - if (W > 2.0 && dsc->stats[dsc_variance] >= 1e-20) - { - double S = dsc->stats[dsc_stddev]; - dsc->stats[dsc_skew] = (W * dsc->M3 / ((W - 1.0) * (W - 2.0) * S * S * S)); - dsc->stats[dsc_seskew] = - sqrt (6.0 * W * (W - 1.0) / ((W - 2.0) * (W + 1.0) * (W + 3.0))); - } - else - { - dsc->stats[dsc_skew] = dsc->stats[dsc_seskew] = SYSMIS; - } - if (W > 3.0 && dsc->stats[dsc_variance] >= 1e-20) - { - double S2 = dsc->stats[dsc_variance]; - double SE_g1 = dsc->stats[dsc_seskew]; - - dsc->stats[dsc_kurt] = - (W * (W + 1.0) * dsc->M4 - 3.0 * dsc->M2 * dsc->M2 * (W - 1.0)) - / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2); - - /* Note that in _SPSS Statistical Algorithms_, the square - root symbol is missing from this formula. */ - dsc->stats[dsc_sekurt] = - sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1) / ((W - 3.0) * (W + 5.0))); - } - else - { - dsc->stats[dsc_kurt] = dsc->stats[dsc_sekurt] = SYSMIS; - } - } - - display (); -} - -/* Statistical display. */ - -static algo_compare_func descriptives_compare_variables; - -static void -display (void) -{ - int i, j; - - int nc, n_stats; - struct tab_table *t; - - /* If op_excl_miss is on, d_glob_valid and (potentially) - d_glob_missing are nonzero, and d_glob_missing equals - d_glob_miss_list. - - If op_excl_miss is off, d_glob_valid is nonzero. d_glob_missing - is zero. d_glob_miss_list is (potentially) nonzero. */ - - if (sortby_stat != -2) - sort (v_variables, n_variables, sizeof *v_variables, - descriptives_compare_variables, &cmd); - - for (nc = i = 0; i < dsc_n_stats; i++) - if (stats & BIT_INDEX (i)) - nc++; - n_stats = nc; - if (!opt[op_no_varnames]) - nc++; - nc += opt[op_serial] ? 2 : 1; - - t = tab_create (nc, n_variables + 1, 0); - tab_headers (t, 1, 0, 1, 0); - tab_box (t, TAL_1, TAL_1, -1, -1, 0, 0, nc - 1, n_variables); - tab_box (t, -1, -1, -1, TAL_1, 1, 0, nc - 1, n_variables); - tab_hline (t, TAL_2, 0, nc - 1, 1); - tab_vline (t, TAL_2, 1, 0, n_variables); - tab_dim (t, tab_natural_dimensions); - - nc = 0; - if (!opt[op_no_varnames]) - { - tab_text (t, nc++, 0, TAB_LEFT | TAT_TITLE, _("Variable")); - } - if (opt[op_serial]) - { - tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Valid N")); - tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, _("Missing N")); - } else { - tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, "N"); - } - - for (i = 0; i < dsc_n_stats; i++) - if (stats & BIT_INDEX (i)) - { - const char *title = gettext (dsc_info[i].s8); - tab_text (t, nc++, 0, TAB_CENTER | TAT_TITLE, title); - } - - for (i = 0; i < n_variables; i++) - { - struct variable *v = v_variables[i]; - - nc = 0; - if (!opt[op_no_varnames]) - tab_text (t, nc++, i + 1, TAB_LEFT, v->name); - tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.valid); - if (opt[op_serial]) - tab_text (t, nc++, i + 1, TAT_PRINTF, "%g", v->p.dsc.miss); - for (j = 0; j < dsc_n_stats; j++) - if (stats & BIT_INDEX (j)) - tab_float (t, nc++, i + 1, TAB_NONE, v->p.dsc.stats[j], 10, 3); - } - - tab_title (t, 1, _("Valid cases = %g; cases with missing value(s) = %g."), - d_glob_valid, d_glob_miss_list); - - tab_submit (t); -} - -/* Compares variables A and B according to the ordering specified - by CMD. */ -static int -descriptives_compare_variables (const void *a_, const void *b_, void *cmd_) -{ - struct variable *const *ap = a_; - struct variable *const *bp = b_; - struct variable *a = *ap; - struct variable *b = *bp; - struct cmd_descriptives *cmd = cmd_; - - int result; - - if (cmd->sortby == DSC_NAME) - result = strcmp (a->name, b->name); - else - { - double as = a->p.dsc.stats[sortby_stat]; - double bs = b->p.dsc.stats[sortby_stat]; - - result = as < bs ? -1 : as > bs; - } - - if (cmd->order == DSC_D) - result = -result; - - return result; -} - -/* - Local variables: - mode: c - End: -*/ diff --git a/src/expr-evl.c b/src/expr-evl.c index 4849d4794d..3c74337531 100644 --- a/src/expr-evl.c +++ b/src/expr-evl.c @@ -42,9 +42,9 @@ #include "julcal/julcal.h" #include "magic.h" #include "misc.h" +#include "moments.h" #include "pool.h" #include "random.h" -#include "stats.h" #include "str.h" #include "var.h" #include "vfm.h" @@ -421,22 +421,18 @@ expr_evaluate (const struct expression *e, const struct ccase *c, int case_num, case OP_CFVAR: { int n_args = *op++; - int nv = 0; - double sum[2] = - {0.0, 0.0}; + double weight, mean, variance; sp -= n_args - 1; - for (i = 0; i < n_args; i++) - if (sp[i].f != SYSMIS) - { - nv++; - sum[0] += sp[i].f; - sum[1] += sp[i].f * sp[i].f; - } - if (nv < *op++) + + moments_of_values (sp, n_args, + &weight, &mean, &variance, NULL, NULL); + + if (weight < *op++ || mean == SYSMIS + || mean == 0 || variance == SYSMIS) sp->f = SYSMIS; else - sp->f = calc_cfvar (sum, nv); + sp->f = sqrt (variance) / mean; } break; case OP_MAX: @@ -475,21 +471,13 @@ expr_evaluate (const struct expression *e, const struct ccase *c, int case_num, case OP_MEAN: { int n_args = *op++; - int nv = 0; - double sum[1] = - {0.0}; + double weight, mean; sp -= n_args - 1; - for (i = 0; i < n_args; i++) - if (sp[i].f != SYSMIS) - { - nv++; - sum[0] += sp[i].f; - } - if (nv < *op++) - sp->f = SYSMIS; - else - sp->f = calc_mean (sum, nv); + + moments_of_values (sp, n_args, + &weight, &mean, NULL, NULL, NULL); + sp->f = weight < *op++ ? SYSMIS : mean; } break; case OP_MIN: @@ -590,23 +578,12 @@ expr_evaluate (const struct expression *e, const struct ccase *c, int case_num, case OP_SD: { int n_args = *op++; - int nv = 0; - double sum[2]; - - sum[0] = sum[1] = 0.0; + double weight, variance; sp -= n_args - 1; - for (i = 0; i < n_args; i++) - if (sp[i].f != SYSMIS) - { - nv++; - sum[0] += sp[i].f; - sum[1] += sp[i].f * sp[i].f; - } - if (nv < *op++) - sp->f = SYSMIS; - else - sp->f = calc_stddev (calc_variance (sum, nv)); + moments_of_values (sp, n_args, + &weight, NULL, &variance, NULL, NULL); + sp->f = weight < *op++ ? SYSMIS : sqrt (variance); } break; case OP_SUM: @@ -631,23 +608,12 @@ expr_evaluate (const struct expression *e, const struct ccase *c, int case_num, case OP_VARIANCE: { int n_args = *op++; - int nv = 0; - double sum[2]; - - sum[0] = sum[1] = 0.0; + double weight, variance; sp -= n_args - 1; - for (i = 0; i < n_args; i++) - if (sp[i].f != SYSMIS) - { - nv++; - sum[0] += sp[i].f; - sum[1] += sp[i].f * sp[i].f; - } - if (nv < *op++) - sp->f = SYSMIS; - else - sp->f = calc_variance (sum, nv); + moments_of_values (sp, n_args, + &weight, NULL, &variance, NULL, NULL); + sp->f = weight < *op++ ? SYSMIS : variance; } break; diff --git a/src/expr-opt.c b/src/expr-opt.c index 5f574f9918..aedc8f4532 100644 --- a/src/expr-opt.c +++ b/src/expr-opt.c @@ -31,7 +31,6 @@ #include "julcal/julcal.h" #include "misc.h" #include "pool.h" -#include "stats.h" #include "str.h" #include "var.h" diff --git a/src/expr-prs.c b/src/expr-prs.c index acf8c3c664..a6a4a7d680 100644 --- a/src/expr-prs.c +++ b/src/expr-prs.c @@ -971,6 +971,7 @@ CONCAT_func (const struct function *f UNUSED, int x UNUSED, union any_node **n) type = parse_or (&(*n)->nonterm.arg[(*n)->nonterm.n]); if (type == EXPR_ERROR) goto fail; + (*n)->nonterm.n++; if (type != EXPR_STRING) { msg (SE, _("Argument %d to CONCAT is type %s. All arguments " @@ -978,7 +979,6 @@ CONCAT_func (const struct function *f UNUSED, int x UNUSED, union any_node **n) (*n)->nonterm.n + 1, expr_type_name (type)); goto fail; } - (*n)->nonterm.n++; if (!lex_match (',')) break; @@ -1068,6 +1068,7 @@ generic_str_func (const struct function *f, int x UNUSED, union any_node **n) goto fail; else if (actual_type == EXPR_BOOLEAN) actual_type = EXPR_NUMERIC; + nonterm->n++; if (actual_type != wanted_type) { msg (SE, _("Argument %d to %s was expected to be of %s type. " @@ -1076,7 +1077,6 @@ generic_str_func (const struct function *f, int x UNUSED, union any_node **n) expr_type_name (actual_type), expr_type_name (wanted_type)); goto fail; } - nonterm->n++; } else if (*cp == 'f') { @@ -1682,3 +1682,75 @@ struct op_desc ops[OP_SENTINEL] = { #include "expr.def" }; + +#include "command.h" + +int +cmd_debug_evaluate (void) +{ + struct expression *expr; + union value value; + enum expr_type expr_flags; + int dump_postfix = 0; + + discard_variables (); + + expr_flags = 0; + if (lex_match_id ("NOOPTIMIZE")) + expr_flags |= EXPR_NO_OPTIMIZE; + if (lex_match_id ("POSTFIX")) + dump_postfix = 1; + if (token != '/') + { + lex_force_match ('/'); + return CMD_FAILURE; + } + fprintf (stderr, "%s => ", lex_rest_of_line (NULL)); + lex_get (); + + expr = expr_parse (EXPR_ANY | expr_flags); + if (!expr || token != '.') + { + if (expr != NULL) + expr_free (expr); + fprintf (stderr, "error\n"); + return CMD_FAILURE; + } + + if (dump_postfix) + expr_debug_print_postfix (expr); + else + { + expr_evaluate (expr, NULL, 0, &value); + switch (expr_get_type (expr)) + { + case EXPR_NUMERIC: + if (value.f == SYSMIS) + fprintf (stderr, "sysmis\n"); + else + fprintf (stderr, "%.2f\n", value.f); + break; + + case EXPR_BOOLEAN: + if (value.f == SYSMIS) + fprintf (stderr, "sysmis\n"); + else if (value.f == 0.0) + fprintf (stderr, "false\n"); + else + fprintf (stderr, "true\n"); + break; + + case EXPR_STRING: + fputc ('"', stderr); + fwrite (value.c + 1, value.c[0], 1, stderr); + fputs ("\"\n", stderr); + break; + + default: + assert (0); + } + } + + expr_free (expr); + return CMD_SUCCESS; +} diff --git a/src/frequencies.q b/src/frequencies.q index e7657b6418..c65567e8eb 100644 --- a/src/frequencies.q +++ b/src/frequencies.q @@ -37,7 +37,6 @@ #include "algorithm.h" #include "magic.h" #include "misc.h" -#include "stats.h" #include "output.h" #include "som.h" #include "str.h" diff --git a/src/levene.c b/src/levene.c index 635c68689e..7e0bd6a6f2 100644 --- a/src/levene.c +++ b/src/levene.c @@ -27,7 +27,7 @@ #include "var.h" #include "vfm.h" #include "alloc.h" -#include "stats.h" +#include "misc.h" #include #include @@ -348,7 +348,7 @@ levene2_calc (struct ccase *c, void *_l) if ( ! l->is_missing(v,var) ) { levene_z = fabs(v->f - gs->mean); - lz_denominator[i] += weight * sqr(levene_z - gs->lz_mean); + lz_denominator[i] += weight * pow2(levene_z - gs->lz_mean); } } @@ -373,7 +373,7 @@ levene2_postcalc (void *_l) g = (struct group_statistics *) hsh_next(hash[v],&hi) ) { - lz_numerator += g->n * sqr(g->lz_mean - lz[v].grand_mean ); + lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean ); } diff --git a/src/misc.h b/src/misc.h index 0f03b6df2d..26b9a52639 100644 --- a/src/misc.h +++ b/src/misc.h @@ -108,4 +108,27 @@ int intlog10 (unsigned); +/* Returns the square of X. */ +static inline double +pow2 (double x) +{ + return x * x; +} + +/* Returns the cube of X. */ +static inline double +pow3 (double x) +{ + return x * x * x; +} + +/* Returns the fourth power of X. */ +static inline double +pow4 (double x) +{ + double y = x * x; + y *= y; + return y; +} + #endif /* math/misc.h */ diff --git a/src/moments.c b/src/moments.c new file mode 100644 index 0000000000..9e27ae0f2c --- /dev/null +++ b/src/moments.c @@ -0,0 +1,426 @@ +/* PSPP - computes sample statistics. + Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#include +#include "moments.h" +#include +#include +#include +#include "alloc.h" +#include "misc.h" +#include "val.h" + +/* FIXME? _SPSS Statistical Algorithms_ in the DESCRIPTIVES + second describes a "provisional means algorithm" that might be + useful for improving accuracy when we only do one pass. */ + +/* A set of moments in process of calculation. */ +struct moments + { + enum moment max_moment; /* Highest-order moment we're computing. */ + int pass; /* Current pass (1 or 2). */ + + /* Pass one. */ + double w1; /* Total weight for pass 1, so far. */ + double sum; /* Sum of values so far. */ + double mean; /* Mean = sum / w1. */ + + /* Pass two. */ + double w2; /* Total weight for pass 2, so far. */ + double d1; /* Sum of deviations from the mean. */ + double d2; /* Sum of squared deviations from the mean. */ + double d3; /* Sum of cubed deviations from the mean. */ + double d4; /* Sum of (deviations from the mean)**4. */ + }; + +/* Initializes moments M for calculating moment MAX_MOMENT and + lower moments. */ +static void +init_moments (struct moments *m, enum moment max_moment) +{ + assert (m != NULL); + assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE + || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS); + m->max_moment = max_moment; + moments_clear (m); +} + +/* Clears out a set of moments so that it can be reused for a new + set of values. The moments to be calculated are not changed. */ +void +moments_clear (struct moments *m) +{ + m->pass = 1; + m->w1 = m->w2 = 0.; + m->sum = 0.; +} + +/* Creates and returns a data structure for calculating moment + MAX_MOMENT and lower moments on a data series. For greatest + accuracy, the user should call moments_pass_one() for each + value in the series, then call moments_pass_two() for the same + set of values in the same order, then call moments_calculate() + to obtain the moments. At a cost of reduced accuracy, the + first pass can be skipped. In either case, moments_destroy() + should be called when the moments are no longer needed. */ +struct moments * +moments_create (enum moment max_moment) +{ + struct moments *m = xmalloc (sizeof *m); + init_moments (m, max_moment); + return m; +} + +/* Adds VALUE with the given WEIGHT to the calculation of + moments for the first pass. */ +void +moments_pass_one (struct moments *m, double value, double weight) +{ + assert (m != NULL); + assert (m->pass == 1); + + if (value != SYSMIS && weight >= 0.) + { + m->sum += value * weight; + m->w1 += weight; + } +} + +/* Adds VALUE with the given WEIGHT to the calculation of + moments for the second pass. */ +void +moments_pass_two (struct moments *m, double value, double weight) +{ + double d, d_power; + + assert (m != NULL); + + if (m->pass == 1) + { + m->pass = 2; + m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.; + m->d1 = m->d2 = m->d3 = m->d4 = 0.; + } + + if (value != SYSMIS && weight >= 0.) + { + m->w2 += weight; + + d = d_power = value - m->mean; + m->d1 += d_power * weight; + + if (m->max_moment >= MOMENT_VARIANCE) + { + d_power *= d; + m->d2 += d_power * weight; + + if (m->max_moment >= MOMENT_SKEWNESS) + { + d_power *= d; + m->d3 += d_power * weight; + + if (m->max_moment >= MOMENT_KURTOSIS) + { + d_power *= d; + m->d4 += d_power * weight; + } + } + } + } +} + +/* Calculates moments based on the input data. Stores the total + weight in *WEIGHT, the mean in *MEAN, the variance in + *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in + *KURTOSIS. Any of these result parameters may be null + pointers, in which case the values are not calculated. If any + result cannot be calculated, either because they are undefined + based on the input data or because their moments are higher + than the maximum requested on moments_create(), then SYSMIS is + stored into that result. */ +void +moments_calculate (const struct moments *m, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis) +{ + double W; + int one_pass; + + assert (m != NULL); + assert (m->pass == 2); + + one_pass = m->w1 == 0.; + + /* If passes 1 and 2 are used, then w1 and w2 must agree. */ + assert (one_pass || m->w1 == m->w2); + + if (mean != NULL) + *mean = SYSMIS; + if (variance != NULL) + *variance = SYSMIS; + if (skewness != NULL) + *skewness = SYSMIS; + if (kurtosis != NULL) + *kurtosis = SYSMIS; + + W = m->w2; + if (weight != NULL) + *weight = W; + if (W == 0.) + return; + + if (mean != NULL) + *mean = m->mean + m->d1 / W; + + if (m->max_moment >= MOMENT_VARIANCE && W > 1.) + { + double variance_tmp; + + /* From _Numerical Recipes in C_, 2nd ed., 0-521-43108-5, + section 14.1. */ + if (variance == NULL) + variance = &variance_tmp; + *variance = (m->d2 - pow2 (m->d1) / W) / (W - 1.); + + /* From _SPSS Statistical Algorithms, 2nd ed., + 0-918469-89-9, section "DESCRIPTIVES". */ + if (fabs (*variance) >= 1e-20) + { + if (m->max_moment >= MOMENT_SKEWNESS && skewness != NULL && W > 2.) + { + *skewness = ((W * m->d3 - 3.0 * m->d1 * m->d2 + 2.0 + * pow3 (m->d1) / W) + / ((W - 1.0) * (W - 2.0) + * *variance * sqrt (*variance))); + if (!finite (*skewness)) + *skewness = SYSMIS; + } + if (m->max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && W > 3.) + { + *kurtosis = (((W + 1) * (W * m->d4 + - 4.0 * m->d1 * m->d3 + + 6.0 * pow2 (m->d1) * m->d2 / W + - 3.0 * pow4 (m->d1) / pow2 (W))) + / ((W - 1.0) * (W - 2.0) * (W - 3.0) + * pow2 (*variance)) + - (3.0 * pow2 (W - 1.0)) + / ((W - 2.0) * (W - 3.))); + if (!finite (*kurtosis)) + *kurtosis = SYSMIS; + } + } + } +} + +/* Destroys a set of moments. */ +void +moments_destroy (struct moments *m) +{ + free (m); +} + +/* Calculates the requested moments on the CNT values in ARRAY. + Each value is given a weight of 1. The total weight is stored + into *WEIGHT (trivially) and the mean, variance, skewness, and + kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and + *KURTOSIS, respectively. Any of the result pointers may be + null, in which case no value is stored. */ +void +moments_of_doubles (const double *array, size_t cnt, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis) +{ + enum moment max_moment; + struct moments m; + size_t idx; + + if (kurtosis != NULL) + max_moment = MOMENT_KURTOSIS; + else if (skewness != NULL) + max_moment = MOMENT_SKEWNESS; + else if (variance != NULL) + max_moment = MOMENT_VARIANCE; + else + max_moment = MOMENT_MEAN; + + init_moments (&m, max_moment); + for (idx = 0; idx < cnt; idx++) + moments_pass_one (&m, array[idx], 1.); + for (idx = 0; idx < cnt; idx++) + moments_pass_two (&m, array[idx], 1.); + moments_calculate (&m, weight, mean, variance, skewness, kurtosis); +} + +/* Calculates the requested moments on the CNT numeric values in + ARRAY. Each value is given a weight of 1. The total weight + is stored into *WEIGHT (trivially) and the mean, variance, + skewness, and kurtosis are stored into *MEAN, *VARIANCE, + *SKEWNESS, and *KURTOSIS, respectively. Any of the result + pointers may be null, in which case no value is stored. */ +void +moments_of_values (const union value *array, size_t cnt, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis) +{ + enum moment max_moment; + struct moments m; + size_t idx; + + if (kurtosis != NULL) + max_moment = MOMENT_KURTOSIS; + else if (skewness != NULL) + max_moment = MOMENT_SKEWNESS; + else if (variance != NULL) + max_moment = MOMENT_VARIANCE; + else + max_moment = MOMENT_MEAN; + + init_moments (&m, max_moment); + for (idx = 0; idx < cnt; idx++) + moments_pass_one (&m, array[idx].f, 1.); + for (idx = 0; idx < cnt; idx++) + moments_pass_two (&m, array[idx].f, 1.); + moments_calculate (&m, weight, mean, variance, skewness, kurtosis); +} + +/* Returns the standard error of the skewness for the given total + weight W. + + From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9, + section "DESCRIPTIVES". */ +double +calc_seskew (double W) +{ + return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.))); +} + +/* Returns the standard error of the kurtosis for the given total + weight W. + + From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9, + section "DESCRIPTIVES", except that the sqrt symbol is omitted + there. */ +double +calc_sekurt (double W) +{ + return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W))) + / ((W - 3.) * (W + 5.))); +} + +#include +#include "command.h" +#include "lexer.h" + +static int +read_values (double **values, double **weights, size_t *cnt) +{ + size_t cap = 0; + + *values = NULL; + *weights = NULL; + *cnt = 0; + while (token == T_NUM) + { + double value = tokval; + double weight = 1.; + lex_get (); + if (lex_match ('*')) + { + if (token != T_NUM) + { + lex_error (_("expecting weight value")); + return 0; + } + weight = tokval; + lex_get (); + } + + if (*cnt >= cap) + { + cap = 2 * (cap + 8); + *values = xrealloc (*values, sizeof **values * cap); + *weights = xrealloc (*weights, sizeof **weights * cap); + } + + (*values)[*cnt] = value; + (*weights)[*cnt] = weight; + (*cnt)++; + } + + return 1; +} + +int +cmd_debug_moments (void) +{ + int retval = CMD_FAILURE; + struct moments *m = NULL; + double *values = NULL; + double *weights = NULL; + double weight, M[4]; + int two_pass = 1; + size_t cnt; + size_t i; + + if (lex_match_id ("ONEPASS")) + two_pass = 0; + if (token != '/') + { + lex_force_match ('/'); + goto done; + } + fprintf (stderr, "%s => ", lex_rest_of_line (NULL)); + lex_get (); + + m = moments_create (MOMENT_KURTOSIS); + if (!read_values (&values, &weights, &cnt)) + goto done; + if (two_pass) + { + for (i = 0; i < cnt; i++) + moments_pass_one (m, values[i], weights[i]); + } + for (i = 0; i < cnt; i++) + moments_pass_two (m, values[i], weights[i]); + moments_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]); + + fprintf (stderr, "W=%.3f", weight); + for (i = 0; i < 4; i++) + { + fprintf (stderr, " M%d=", i + 1); + if (M[i] == SYSMIS) + fprintf (stderr, "sysmis"); + else if (fabs (M[i]) <= 0.0005) + fprintf (stderr, "0.000"); + else + fprintf (stderr, "%.3f", M[i]); + } + fprintf (stderr, "\n"); + + retval = lex_end_of_command (); + + done: + moments_destroy (m); + free (values); + free (weights); + return retval; +} diff --git a/src/moments.h b/src/moments.h new file mode 100644 index 0000000000..e48bfd5ec6 --- /dev/null +++ b/src/moments.h @@ -0,0 +1,61 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2004 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#ifndef HEADER_MOMENTS +#define HEADER_MOMENTS + +#include +#include "val.h" + +/* Moments of the mean. + Higher-order moments have higher values. */ +enum moment + { + MOMENT_NONE, /* No moments. */ + MOMENT_MEAN, /* First-order moment. */ + MOMENT_VARIANCE, /* Second-order moment. */ + MOMENT_SKEWNESS, /* Third-order moment. */ + MOMENT_KURTOSIS /* Fourth-order moment. */ + }; + +struct moments; + +struct moments *moments_create (enum moment max_moment); +void moments_clear (struct moments *); +void moments_pass_one (struct moments *, double value, double weight); +void moments_pass_two (struct moments *, double value, double weight); +void moments_calculate (const struct moments *, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis); +void moments_destroy (struct moments *); + +void moments_of_doubles (const double *array, size_t cnt, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis); +void moments_of_values (const union value *array, size_t cnt, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis); + +double calc_seskew (double weight); +double calc_sekurt (double weight); + +#endif /* moments.h */ diff --git a/src/sort.c b/src/sort.c index c17ca2a87e..5f9cdb42f5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -25,6 +25,7 @@ #include #include "algorithm.h" #include "alloc.h" +#include "casefile.h" #include "command.h" #include "error.h" #include "expr.h" @@ -35,6 +36,7 @@ #include "var.h" #include "vfm.h" #include "vfmP.h" +#include "workspace.h" #if HAVE_UNISTD_H #include @@ -53,7 +55,8 @@ /* Other prototypes. */ static int compare_record (const union value *, const union value *, const struct sort_cases_pgm *, int *idx_to_fv); -static int compare_case_lists (const void *, const void *, void *); +static int compare_cases (const struct ccase *, const struct ccase *, void *); +static int compare_case_dblptrs (const void *, const void *, void *); static struct internal_sort *do_internal_sort (struct sort_cases_pgm *, int separate); static void destroy_internal_sort (struct internal_sort *); @@ -163,13 +166,16 @@ destroy_sort_cases_pgm (struct sort_cases_pgm *scp) } /* Sorts the active file based on the key variables specified in - global variables vars and var_cnt. The output is either to the - active file, if SEPARATE is zero, or to a separate file, if - SEPARATE is nonzero. In the latter case the output cases can be - read with a call to read_sort_output(). (In the former case the - output cases should be dealt with through the usual vfm interface.) + global variables vars and var_cnt. - The caller is responsible for freeing vars[]. */ + If SEPARATE is zero, then output goes to the active file. The + output cases can be read through the usual VFM interfaces. + + If SEPARATE is nonzero, then output goes to a separate file. + The output cases can be read with a call to + read_sort_output(). + + The caller is responsible for freeing SCP. */ int sort_cases (struct sort_cases_pgm *scp, int separate) { @@ -190,9 +196,6 @@ sort_cases (struct sort_cases_pgm *scp, int separate) return 1; /* Fall back to an external sort. */ - if (vfm_source != NULL - && case_source_is_class (vfm_source, &storage_source_class)) - storage_source_to_disk (vfm_source); scp->xsrt = do_external_sort (scp, separate); if (scp->xsrt != NULL) return 1; @@ -201,10 +204,12 @@ sort_cases (struct sort_cases_pgm *scp, int separate) return 0; } -/* Results of an internal sort. */ +/* Results of an internal sort. + Used only for sorting to a separate file. */ struct internal_sort { - struct case_list **results; + const struct ccase **cases; + size_t case_cnt; }; /* If the data is in memory, do an internal sort. Return @@ -215,52 +220,53 @@ do_internal_sort (struct sort_cases_pgm *scp, int separate) struct internal_sort *isrt; isrt = xmalloc (sizeof *isrt); - isrt->results = NULL; + isrt->cases = NULL; + isrt->case_cnt = 0; - if (case_source_is_class (vfm_source, &storage_source_class) - && !storage_source_on_disk (vfm_source)) + if (case_source_is_class (vfm_source, &storage_source_class)) { - struct case_list *case_list; - struct case_list **case_array; - int case_cnt; - int i; + struct casefile *casefile = storage_source_get_casefile (vfm_source); - case_cnt = vfm_source->class->count (vfm_source); - if (case_cnt <= 0) - return isrt; - - if (case_cnt > get_max_workspace() / sizeof *case_array) - goto error; - - case_list = storage_source_get_cases (vfm_source); - case_array = malloc (sizeof *case_array * (case_cnt + 1)); - if (case_array == NULL) - goto error; - - for (i = 0; case_list != NULL; i++) + if (!separate) { - case_array[i] = case_list; - case_list = case_list->next; + if (!casefile_sort (casefile, compare_cases, scp)) + goto error; } - assert (i == case_cnt); - case_array[case_cnt] = NULL; + else + { + /* FIXME FIXME FIXME. + This is crap because the casefile could get flushed + to disk between the time we sort it and we use it + later, causing invalid pointer accesses. + The right solution is probably to extend casefiles + to support duplication. */ + struct casereader *reader; + size_t case_idx; + + if (!casefile_in_core (casefile)) + goto error; + + isrt->case_cnt = casefile_get_case_cnt (casefile); + isrt->cases = workspace_malloc (sizeof *isrt->cases + * isrt->case_cnt); + if (isrt->cases == NULL) + goto error; - sort (case_array, case_cnt, sizeof *case_array, - compare_case_lists, scp); + reader = casefile_get_reader (casefile); + for (case_idx = 0; case_idx < isrt->case_cnt; case_idx++) + { + casereader_read (reader, &isrt->cases[case_idx]); + assert (isrt->cases[case_idx] != NULL); + } + casereader_destroy (reader); - if (!separate) - { - storage_source_set_cases (vfm_source, case_array[0]); - for (i = 1; i <= case_cnt; i++) - case_array[i - 1]->next = case_array[i]; - free (case_array); + sort (isrt->cases, isrt->case_cnt, casefile_get_case_size (casefile), + compare_case_dblptrs, scp); } - else - isrt->results = case_array; - + return isrt; } - + error: free (isrt); return NULL; @@ -272,24 +278,34 @@ destroy_internal_sort (struct internal_sort *isrt) { if (isrt != NULL) { - free (isrt->results); + workspace_free (isrt->cases, sizeof *isrt->cases * isrt->case_cnt); free (isrt); } } -/* Compares the VAR_CNT variables in VARS[] between the - `case_list's at A and B, and returns a strcmp()-type - result. */ +/* Compares the variables specified by SCP between the cases at A + and B, and returns a strcmp()-type result. */ static int -compare_case_lists (const void *a_, const void *b_, void *scp_) +compare_cases (const struct ccase *a, const struct ccase *b, + void *scp_) { struct sort_cases_pgm *scp = scp_; - struct case_list *const *pa = a_; - struct case_list *const *pb = b_; - struct case_list *a = *pa; - struct case_list *b = *pb; - return compare_record (a->c.data, b->c.data, scp, NULL); + return compare_record (a->data, b->data, scp, NULL); +} + +/* Compares the variables specified by SCP between the cases at A + and B, and returns a strcmp()-type result. */ +static int +compare_case_dblptrs (const void *a_, const void *b_, void *scp_) +{ + struct sort_cases_pgm *scp = scp_; + struct ccase *const *pa = a_; + struct ccase *const *pb = b_; + struct ccase *a = *pa; + struct ccase *b = *pb; + + return compare_record (a->data, b->data, scp, NULL); } /* External sort. */ @@ -356,6 +372,10 @@ do_external_sort (struct sort_cases_pgm *scp, int separate) struct external_sort *xsrt; int success = 0; + if (vfm_source != NULL + && case_source_is_class (vfm_source, &storage_source_class)) + casefile_to_disk (storage_source_get_casefile (vfm_source)); + xsrt = xmalloc (sizeof *xsrt); xsrt->scp = scp; if (!init_external_sort (xsrt)) @@ -1344,12 +1364,11 @@ read_internal_sort_output (struct internal_sort *isrt, read_sort_output_func *output_func, void *aux) { - struct case_list **p; + size_t case_idx; - for (p = isrt->results; *p; p++) - if (!output_func (&(*p)->c, aux)) + for (case_idx = 0; case_idx < isrt->case_cnt; case_idx++) + if (!output_func (isrt->cases[case_idx], aux)) break; - free (isrt->results); } static void diff --git a/src/stats.c b/src/stats.c deleted file mode 100644 index 4727b938e4..0000000000 --- a/src/stats.c +++ /dev/null @@ -1,203 +0,0 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. - Written by Ben Pfaff . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. */ - -#include -#include "stats.h" -#include - -/* Returns the fourth power of its argument. */ -double -pow4 (double x) -{ - x *= x; - return x * x; -} - -/* Returns the cube of its argument. */ -double -cube (double x) -{ - return x * x * x; -} - -/* Returns the square of its argument. */ -double -sqr (double x) -{ - return x * x; -} - -/* - * kurtosis = [(n+1){n*sum(X**4) - 4*sum(X)*sum(X**3) - * + 6*sum(X)**2*sum(X**2)/n - 3*sum(X)**4/n**2}] - * /[(n-1)(n-2)(n-3)*(variance)**2] - * -[3*{(n-1)**2}] - * /[(n-2)(n-3)] - * - * This and other formulas from _Biometry_, Sokal and Rohlf, - * W. H. Freeman and Company, 1969. See pages 117 and 136 especially. - */ -double -calc_kurt (const double d[4], double n, double variance) -{ - return - (((n + 1) * (n * d[3] - - 4.0 * d[0] * d[2] - + 6.0 * sqr (d[0]) * d[1] / n - - 3.0 * pow4 (d[0]) / sqr (n))) - / ((n - 1.0) * (n - 2.0) * (n - 3.0) * sqr (variance)) - - (3.0 * sqr (n - 1.0)) - / ((n - 2.0) * (n - 3.))); -} - -/* - * standard error of kurtosis = sqrt([24n((n-1)**2)]/[(n-3)(n-2)(n+3)(n+5)]) - */ -double -calc_sekurt (double n) -{ - return sqrt ((24.0 * n * sqr (n - 1.0)) - / ((n - 3.0) * (n - 2.0) * (n + 3.0) * (n + 5.0))); -} - -/* - * skewness = [n*sum(X**3) - 3*sum(X)*sum(X**2) + 2*sum(X)**3/n]/ - * /[(n-1)(n-2)*(variance)**3] - */ -double -calc_skew (const double d[3], double n, double stddev) -{ - return - ((n * d[2] - 3.0 * d[0] * d[1] + 2.0 * cube (d[0]) / n) - / ((n - 1.0) * (n - 2.0) * cube (stddev))); -} - -/* - * standard error of skewness = sqrt([6n(n-1)] / [(n-2)(n+1)(n+3)]) - */ -double -calc_seskew (double n) -{ - return - sqrt ((6.0 * n * (n - 1.0)) - / ((n - 2.0) * (n + 1.0) * (n + 3.0))); -} - -/* Returns one-sided significance level corresponding to standard - normal deviate X. Algorithm from _SPSS Statistical Algorithms_, - Appendix 1. */ -#if 0 -double -normal_sig (double x) -{ - const double a1 = .070523078; - const double a2 = .0422820123; - const double a3 = .0092705272; - const double a4 = .0001520143; - const double a5 = .0002765672; - const double a6 = .0000430638; - - const double z = fabs (x) <= 14.14 ? 0.7071067812 * fabs (x) : 10.; - double r; - - r = 1. + z * (a1 + z * (a2 + z * (a3 + z * (a4 + z * (a5 + z * a6))))); - r *= r; /* r ** 2 */ - r *= r; /* r ** 4 */ - r *= r; /* r ** 16 */ - - return .5 / r; -} -#else /* 1 */ -/* Taken from _BASIC Statistics: An Introduction to Problem Solving - with Your Personal Computer_, Jerry W. O'Dell, TAB 1984, page 314-5. */ -double -normal_sig (double z) -{ - double h; - - h = 1 + 0.0498673470 * z; - z *= z; - h += 0.0211410061 * z; - z *= z; - h += 0.0032776263 * z; - z *= z; - h += 0.0000380036 * z; - z *= z; - h += 0.0000488906 * z; - z *= z; - h += 0.0000053830 * z; - return pow (h, -16.) / 2.; -} -#endif /* 1 */ - -/* Algorithm from _Turbo Pascal Programmer's Toolkit_, Rugg and - Feldman, Que 1989. Returns the significance level of chi-square - value CHISQ with DF degrees of freedom, correct to at least 7 - decimal places. */ -double -chisq_sig (double x, int k) -{ - if (x <= 0. || k < 1) - return 1.0; - else if (k == 1) - return 2. * normal_sig (sqrt (x)); - else if (k <= 30) - { - double z, z_partial, term, denom, numerator, value; - - z = 1.; - z_partial = 1.; - term = k; - do - { - term += 2; - z_partial *= x / term; - if (z_partial >= 10000000.) - return 0.; - z += z_partial; - } - while (z_partial >= 1.e-7); - denom = term = 2 - k % 2; - while (term < k) - { - term += 2; - denom *= term; - } - if (k % 2) - { - value = ((k + 1) / 2) * log (x) - x / 2.; - numerator = exp (value) * sqrt (2. / x / PI); - } - else - { - value = k / 2. * log (x) - x / 2.; - numerator = exp (value); - } - return 1. - numerator * z / denom; - } - else - { - double term, numer, norm_x; - - term = 2. / 9. / k; - numer = pow (x / k, 1. / 3.); - norm_x = numer / sqrt (term); - return 1.0 - normal_sig (norm_x); - } -} diff --git a/src/stats.h b/src/stats.h deleted file mode 100644 index 46cb7266dc..0000000000 --- a/src/stats.h +++ /dev/null @@ -1,88 +0,0 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. - Written by Ben Pfaff . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. */ - -#if !statistics_h -#define statistics_h 1 - -/* These are all sample statistics except for mean since uses - population statistics for whatever reason. */ - -/* Define pi to the maximum precision available. */ -#include /* defines M_PI on many systems */ -#ifndef PI -#ifdef M_PI -#define PI M_PI -#else /* !PI && !M_PI */ -#define PI 3.14159265358979323846264338327 -#endif /* !PI && !M_PI */ -#endif /* !PI */ - -extern double pow4 (double); -extern double cube (double); -extern double sqr (double); - -/* Returns the fourth power of its argument. */ -extern inline double -pow4 (double x) -{ - x *= x; - return x * x; -} - -/* Returns the cube of its argument. */ -extern inline double -cube (double x) -{ - return x * x * x; -} - -/* Returns the square of its argument. */ -extern inline double -sqr (double x) -{ - return x * x; -} - -/* Mean, standard error of mean. */ -#define calc_mean(D, N) \ - ((D)[0] / (N)) -#define calc_semean(STDDEV, N) \ - ((STDDEV) / sqrt (N)) - -/* Variance, standard deviation, coefficient of variance. */ -#define calc_variance(D, N) \ - ( ((D)[1] - sqr ((D)[0])/(N)) / ((N)-1) ) -#define calc_stddev(VARIANCE) \ - (sqrt (VARIANCE)) -#define calc_cfvar(D, N) \ - ( calc_stddev (calc_variance (D, N)) / calc_mean (D, N) ) - -/* Kurtosis, standard error of kurtosis. */ -double calc_kurt (const double d[4], double n, double variance); -double calc_sekurt (double n); - -/* Skewness, standard error of skewness. */ -double calc_skew (const double d[3], double n, double stddev); -double calc_seskew (double n); - -/* Significance. */ -double normal_sig (double x); -double chisq_sig (double chisq, int df); - -#endif /* !statistics_h */ diff --git a/src/t-test.q b/src/t-test.q index 6d1801243f..0f4133a6c7 100644 --- a/src/t-test.q +++ b/src/t-test.q @@ -31,13 +31,13 @@ #include "lexer.h" #include "error.h" #include "magic.h" +#include "misc.h" #include "tab.h" #include "som.h" #include "value-labels.h" #include "var.h" #include "vfm.h" #include "hash.h" -#include "stats.h" #include "t-test.h" #include "levene.h" @@ -972,9 +972,9 @@ trbox_independent_samples_populate(struct trbox *self, df = gs0->n + gs1->n - 2.0 ; tab_float (self->t, 5, i*2+3, TAB_RIGHT, df, 2, 0); - pooled_variance = ( (gs0->n )*sqr(gs0->s_std_dev) + pooled_variance = ( (gs0->n )*pow2(gs0->s_std_dev) + - (gs1->n )*sqr(gs1->s_std_dev) + (gs1->n )*pow2(gs1->s_std_dev) ) / df ; t = (gs0->mean - gs1->mean) / sqrt(pooled_variance) ; @@ -991,7 +991,7 @@ trbox_independent_samples_populate(struct trbox *self, tab_float(self->t, 7, i*2+3, TAB_RIGHT, mean_diff, 8, 3); - std_err_diff = sqrt( sqr(gs0->se_mean) + sqr(gs1->se_mean)); + std_err_diff = sqrt( pow2(gs0->se_mean) + pow2(gs1->se_mean)); tab_float(self->t, 8, i*2+3, TAB_RIGHT, std_err_diff, 8, 3); @@ -1014,18 +1014,18 @@ trbox_independent_samples_populate(struct trbox *self, TAB_LEFT, _("Equal variances not assumed")); - se2 = (sqr(gs0->s_std_dev)/(gs0->n -1) ) + - (sqr(gs1->s_std_dev)/(gs1->n -1) ); + se2 = (pow2(gs0->s_std_dev)/(gs0->n -1) ) + + (pow2(gs1->s_std_dev)/(gs1->n -1) ); t = mean_diff / sqrt(se2) ; tab_float (self->t, 4, i*2+3+1, TAB_RIGHT, t, 8, 3); - df = sqr(se2) / ( - (sqr(sqr(gs0->s_std_dev)/(gs0->n - 1 )) + df = pow2(se2) / ( + (pow2(pow2(gs0->s_std_dev)/(gs0->n - 1 )) /(gs0->n -1 ) ) + - (sqr(sqr(gs1->s_std_dev)/(gs1->n - 1 )) + (pow2(pow2(gs1->s_std_dev)/(gs1->n - 1 )) /(gs1->n -1 ) ) ) ; @@ -1134,7 +1134,7 @@ trbox_paired_populate(struct trbox *trb, t = (pairs[i].mean[0] - pairs[i].mean[1]) / sqrt ( - ( sqr (pairs[i].s_std_dev[0]) + sqr (pairs[i].s_std_dev[1]) - + ( pow2 (pairs[i].s_std_dev[0]) + pow2 (pairs[i].s_std_dev[1]) - 2 * pairs[i].correlation * pairs[i].s_std_dev[0] * pairs[i].s_std_dev[1] ) / (n - 1) @@ -1294,7 +1294,7 @@ pscbox(void) double correlation_t = pairs[i].correlation * sqrt(df) / - sqrt(1 - sqr(pairs[i].correlation)); + sqrt(1 - pow2(pairs[i].correlation)); /* row headings */ @@ -1569,13 +1569,13 @@ paired_calc (struct ccase *c, void *aux UNUSED) pairs[i].sum[0] += weight * val0->f; pairs[i].sum[1] += weight * val1->f; - pairs[i].ssq[0] += weight * sqr(val0->f); - pairs[i].ssq[1] += weight * sqr(val1->f); + pairs[i].ssq[0] += weight * pow2(val0->f); + pairs[i].ssq[1] += weight * pow2(val1->f); pairs[i].sum_of_prod += weight * val0->f * val1->f ; pairs[i].sum_of_diffs += weight * ( val0->f - val1->f ) ; - pairs[i].ssq_diffs += weight * sqr(val0->f - val1->f); + pairs[i].ssq_diffs += weight * pow2(val0->f - val1->f); } } @@ -1596,11 +1596,11 @@ paired_postcalc (void *aux UNUSED) { pairs[i].mean[j] = pairs[i].sum[j] / n ; pairs[i].s_std_dev[j] = sqrt((pairs[i].ssq[j] / n - - sqr(pairs[i].mean[j])) + pow2(pairs[i].mean[j])) ); pairs[i].std_dev[j] = sqrt(n/(n-1)*(pairs[i].ssq[j] / n - - sqr(pairs[i].mean[j])) + pow2(pairs[i].mean[j])) ); } @@ -1616,7 +1616,7 @@ paired_postcalc (void *aux UNUSED) pairs[i].std_dev_diff = sqrt ( n / (n - 1) * ( ( pairs[i].ssq_diffs / n ) - - sqr(pairs[i].mean_diff ) + pow2(pairs[i].mean_diff ) ) ); } } @@ -1739,7 +1739,7 @@ group_calc (struct ccase *c, void *aux UNUSED) { gs->n+=weight; gs->sum+=weight * val->f; - gs->ssq+=weight * sqr(val->f); + gs->ssq+=weight * pow2(val->f); } } diff --git a/src/var.h b/src/var.h index 6d5cad7938..21b23faccf 100644 --- a/src/var.h +++ b/src/var.h @@ -115,36 +115,6 @@ struct list_proc int vert; /* Whether to print the varname vertically. */ }; -/* DESCRIPTIVES private data. Note that the DESCRIPTIVES procedure also - has a transformation, descriptives_trns. */ -enum - { - /* As these are used as bit indexes, there must be 32 or fewer. - Be very careful in adjusting these, see the structure below - and the table in descriptives.q. */ - dsc_mean = 0, dsc_semean, dsc_stddev, dsc_variance, dsc_kurt, - dsc_sekurt, dsc_skew, dsc_seskew, dsc_range, dsc_min, - dsc_max, dsc_sum, dsc_n_stats - }; - -struct descriptives_proc - { - /* Miscellaneous. */ - int dup; /* Finds duplicates in list of - variables. */ - char zname[10]; /* Name for z-score variable. */ - - /* Counts. */ - double valid, miss; /* Valid, missing--general. */ - - /* Mean, moments about the mean. */ - double X_bar, M2, M3, M4; - double min, max; - - /* Statistics. */ - double stats[dsc_n_stats]; /* Everything glommed together. */ - }; - /* GET private data. */ struct get_proc { @@ -244,7 +214,6 @@ struct variable union { struct crosstab_proc crs; - struct descriptives_proc dsc; struct frequencies_proc frq; struct list_proc lst; struct means_proc mns; diff --git a/src/vfm.c b/src/vfm.c index 1ea4211e32..bfd1ef017a 100644 --- a/src/vfm.c +++ b/src/vfm.c @@ -28,6 +28,7 @@ #include /* Required by SunOS4. */ #endif #include "alloc.h" +#include "casefile.h" #include "do-ifP.h" #include "error.h" #include "expr.h" @@ -73,12 +74,6 @@ struct case_sink *vfm_sink; stored, zero otherwise. */ static int compaction_necessary; -/* Nonzero means that we've overflowed our allotted workspace. - After that happens once during a session, we always store the - active file on disk instead of in memory. (This policy may be - too aggressive.) */ -static int workspace_overflow = 0; - /* Time at which vfm was last invoked. */ time_t last_vfm_invocation; @@ -356,6 +351,7 @@ compact_case (struct ccase *dest, const struct ccase *src) /* Copy all the variables except scratch variables from SRC to DEST. */ + /* FIXME: this should be temp_dict not default_dict I guess. */ var_cnt = dict_get_var_cnt (default_dict); for (i = 0; i < var_cnt; i++) { @@ -461,21 +457,9 @@ close_active_file (void) /* Information about storage sink or source. */ struct storage_stream_info { - size_t case_cnt; /* Number of cases. */ - size_t case_size; /* Number of bytes in case. */ - enum { DISK, MEMORY } mode; /* Where is data stored? */ - - /* Disk storage. */ - FILE *file; /* Data file. */ - - /* Memory storage. */ - int max_cases; /* Maximum cases before switching to disk. */ - struct case_list *head; /* First case in list. */ - struct case_list *tail; /* Last case in list. */ + struct casefile *casefile; /* Storage. */ }; -static void open_storage_file (struct storage_stream_info *info); - /* Initializes a storage sink. */ static void storage_sink_open (struct case_sink *sink) @@ -483,91 +467,14 @@ storage_sink_open (struct case_sink *sink) struct storage_stream_info *info; sink->aux = info = xmalloc (sizeof *info); - info->case_cnt = 0; - info->case_size = sink->value_cnt * sizeof (union value); - info->file = NULL; - info->max_cases = 0; - info->head = info->tail = NULL; - if (workspace_overflow) - { - info->mode = DISK; - open_storage_file (info); - } - else - { - info->mode = MEMORY; - info->max_cases = (get_max_workspace() - / (sizeof (struct case_list) + info->case_size)); - } -} - -/* Creates a new temporary file and puts it into INFO. */ -static void -open_storage_file (struct storage_stream_info *info) -{ - info->file = tmpfile (); - if (info->file == NULL) - { - msg (ME, _("An error occurred creating a temporary " - "file for use as the active file: %s."), - strerror (errno)); - err_failure (); - } -} - -/* Writes the VALUE_CNT values in VALUES to FILE. */ -static void -write_storage_file (FILE *file, const union value *values, size_t value_cnt) -{ - if (fwrite (values, sizeof *values * value_cnt, 1, file) != 1) - { - msg (ME, _("An error occurred writing to a " - "temporary file used as the active file: %s."), - strerror (errno)); - err_failure (); - } -} - -/* If INFO represents records in memory, moves them to disk. - Each comprises VALUE_CNT `union value's. */ -static void -storage_to_disk (struct storage_stream_info *info, size_t value_cnt) -{ - struct case_list *cur, *next; - - if (info->mode == MEMORY) - { - info->mode = DISK; - open_storage_file (info); - for (cur = info->head; cur; cur = next) - { - next = cur->next; - write_storage_file (info->file, cur->c.data, value_cnt); - free (cur); - } - info->head = info->tail = NULL; - } + info->casefile = casefile_create (sink->value_cnt * sizeof (union value)); } /* Destroys storage stream represented by INFO. */ static void destroy_storage_stream_info (struct storage_stream_info *info) { - if (info->mode == DISK) - { - if (info->file != NULL) - fclose (info->file); - } - else - { - struct case_list *cur, *next; - - for (cur = info->head; cur; cur = next) - { - next = cur->next; - free (cur); - } - } + casefile_destroy (info->casefile); free (info); } @@ -577,39 +484,7 @@ storage_sink_write (struct case_sink *sink, const struct ccase *c) { struct storage_stream_info *info = sink->aux; - info->case_cnt++; - if (info->mode == MEMORY) - { - struct case_list *new_case; - - /* Copy case. */ - new_case = xmalloc (sizeof (struct case_list) - + ((sink->value_cnt - 1) * sizeof (union value))); - memcpy (&new_case->c, c, sizeof (union value) * sink->value_cnt); - - /* Append case to linked list. */ - new_case->next = NULL; - if (info->head != NULL) - info->tail->next = new_case; - else - info->head = new_case; - info->tail = new_case; - - /* Dump all the cases to disk if we've run out of - workspace. */ - if (info->case_cnt > info->max_cases) - { - workspace_overflow = 1; - msg (MW, _("Workspace limit of %d KB (%d cases at %d bytes each) " - "overflowed. Writing active file to disk."), - get_max_workspace() / 1024, info->max_cases, - sizeof (struct case_list) + info->case_size); - - storage_to_disk (info, sink->value_cnt); - } - } - else - write_storage_file (info->file, c->data, sink->value_cnt); + casefile_append (info->casefile, c); } /* Destroys internal data in SINK. */ @@ -624,22 +499,7 @@ storage_sink_destroy (struct case_sink *sink) static struct case_source * storage_sink_make_source (struct case_sink *sink) { - struct storage_stream_info *info = sink->aux; - - if (info->mode == DISK) - { - /* Rewind the file. */ - assert (info->file != NULL); - if (fseek (info->file, 0, SEEK_SET) != 0) - { - msg (ME, _("An error occurred while attempting to rewind a " - "temporary file used as the active file: %s."), - strerror (errno)); - err_failure (); - } - } - - return create_case_source (&storage_source_class, sink->dict, info); + return create_case_source (&storage_source_class, sink->dict, sink->aux); } /* Storage sink. */ @@ -661,51 +521,28 @@ storage_source_count (const struct case_source *source) { struct storage_stream_info *info = source->aux; - return info->case_cnt; + return casefile_get_case_cnt (info->casefile); } /* Reads all cases from the storage source and passes them one by one to write_case(). */ static void storage_source_read (struct case_source *source, - struct ccase *c, + struct ccase *output_case, write_case_func *write_case, write_case_data wc_data) { struct storage_stream_info *info = source->aux; + const struct ccase *casefile_case; + struct casereader *reader; - if (info->mode == DISK) + reader = casefile_get_reader (info->casefile); + while (casereader_read (reader, &casefile_case)) { - int i; - - for (i = 0; i < info->case_cnt; i++) - { - if (!fread (c, info->case_size, 1, info->file)) - { - msg (ME, _("An error occurred while attempting to read from " - "a temporary file created for the active file: %s."), - strerror (errno)); - err_failure (); - break; - } - - if (!write_case (wc_data)) - break; - } - } - else - { - while (info->head != NULL) - { - struct case_list *iter = info->head; - memcpy (c, &iter->c, info->case_size); - if (!write_case (wc_data)) - break; - - info->head = iter->next; - free (iter); - } - info->tail = NULL; + memcpy (output_case, casefile_case, + casefile_get_case_size (info->casefile)); + write_case (wc_data); } + casereader_destroy (reader); } /* Destroys the source's internal data. */ @@ -724,44 +561,13 @@ const struct case_source_class storage_source_class = storage_source_destroy, }; -/* Returns nonzero only if SOURCE is stored on disk (instead of - in memory). */ -int -storage_source_on_disk (const struct case_source *source) +struct casefile * +storage_source_get_casefile (struct case_source *source) { struct storage_stream_info *info = source->aux; - return info->mode == DISK; -} - -/* Returns the list of cases in storage source SOURCE. */ -struct case_list * -storage_source_get_cases (const struct case_source *source) -{ - struct storage_stream_info *info = source->aux; - - assert (info->mode == MEMORY); - return info->head; -} - -/* Sets the list of cases in memory source SOURCE to CASES. */ -void -storage_source_set_cases (const struct case_source *source, - struct case_list *cases) -{ - struct storage_stream_info *info = source->aux; - - assert (info->mode == MEMORY); - info->head = cases; -} - -/* If SOURCE has its cases in memory, writes them to disk. */ -void -storage_source_to_disk (struct case_source *source) -{ - struct storage_stream_info *info = source->aux; - - storage_to_disk (info, source->value_cnt); + assert (source->class == &storage_source_class); + return info->casefile; } /* Null sink. Used by a few procedures that keep track of output @@ -1037,3 +843,74 @@ dump_splits (struct ccase *c) tab_flags (t, SOMF_NO_TITLE); tab_submit (t); } + +/* Represents auxiliary data for handling SPLIT FILE in a + multipass procedure. */ +struct multipass_split_aux_data + { + struct ccase *prev_case; /* Data in previous case. */ + struct casefile *casefile; /* Accumulates data for a split. */ + + /* Function to call with the accumulated data. */ + void (*split_func) (const struct casefile *, void *); + void *func_aux; /* Auxiliary data. */ + }; + +static int multipass_split_callback (struct ccase *c, void *aux_); +static void multipass_split_output (struct multipass_split_aux_data *); + +void +multipass_procedure_with_splits (void (*split_func) (const struct casefile *, + void *), + void *func_aux) +{ + struct multipass_split_aux_data aux; + + assert (split_func != NULL); + + aux.prev_case = xmalloc (dict_get_case_size (default_dict)); + aux.casefile = NULL; + aux.split_func = split_func; + aux.func_aux = func_aux; + + procedure (multipass_split_callback, &aux); + + if (aux.casefile != NULL) + multipass_split_output (&aux); + free (aux.prev_case); +} + +/* procedure() callback used by multipass_procedure_with_splits(). */ +static int +multipass_split_callback (struct ccase *c, void *aux_) +{ + struct multipass_split_aux_data *aux = aux_; + + /* Start a new series if needed. */ + if (aux->casefile == NULL || !equal_splits (c, aux->prev_case)) + { + /* Pass any cases to split_func. */ + if (aux->casefile != NULL) + multipass_split_output (aux); + + /* Start a new casefile. */ + aux->casefile = casefile_create (dict_get_case_size (default_dict)); + + /* Record split values. */ + dump_splits (c); + memcpy (aux->prev_case, c, dict_get_case_size (default_dict)); + } + + casefile_append (aux->casefile, c); + + return 1; +} + +static void +multipass_split_output (struct multipass_split_aux_data *aux) +{ + assert (aux->casefile != NULL); + aux->split_func (aux->casefile, aux->func_aux); + casefile_destroy (aux->casefile); + aux->casefile = NULL; +} diff --git a/src/vfm.h b/src/vfm.h index 9577953ef4..5d9403bbde 100644 --- a/src/vfm.h +++ b/src/vfm.h @@ -75,11 +75,7 @@ int case_source_is_complex (const struct case_source *); int case_source_is_class (const struct case_source *, const struct case_source_class *); -int storage_source_on_disk (const struct case_source *); -struct case_list *storage_source_get_cases (const struct case_source *); -void storage_source_set_cases (const struct case_source *, - struct case_list *); -void storage_source_to_disk (struct case_source *source); +struct casefile *storage_source_get_casefile (struct case_source *); /* The replacement active file, to which cases are written. */ extern struct case_sink *vfm_sink; @@ -123,6 +119,9 @@ extern const struct case_sink_class null_sink_class; struct case_sink *create_case_sink (const struct case_sink_class *, const struct dictionary *, void *); +void case_sink_open (struct case_sink *); +void case_sink_write (struct case_sink *, const struct ccase *); +void case_sink_destroy (struct case_sink *); void free_case_sink (struct case_sink *); /* Number of cases to lag. */ @@ -134,5 +133,9 @@ void procedure_with_splits (void (*begin_func) (void *aux), void (*end_func) (void *aux), void *aux); struct ccase *lagged_case (int n_before); + +void multipass_procedure_with_splits (void (*) (const struct casefile *, + void *), + void *aux); #endif /* !vfm_h */ diff --git a/src/workspace.c b/src/workspace.c new file mode 100644 index 0000000000..0095920ebf --- /dev/null +++ b/src/workspace.c @@ -0,0 +1,53 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2004 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#include +#include "workspace.h" +#include +#include +#include "alloc.h" +#include "settings.h" + +static size_t workspace_used; + +/* Returns a block SIZE bytes in size, charging it against the + workspace limit. Returns a null pointer if the workspace + limit is reached. */ +void * +workspace_malloc (size_t size) +{ + if (workspace_used + size > get_max_workspace ()) + return NULL; + + workspace_used += size; + return xmalloc (size); +} + +/* Frees BLOCK, which is SIZE bytes long, and credits it toward + the workspace limit. */ +void +workspace_free (void *block, size_t size) +{ + if (block != NULL) + { + assert (workspace_used >= size); + free (block); + workspace_used -= size; + } +} diff --git a/src/workspace.h b/src/workspace.h new file mode 100644 index 0000000000..8e3000efdb --- /dev/null +++ b/src/workspace.h @@ -0,0 +1,28 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2004 Free Software Foundation, Inc. + Written by Ben Pfaff . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +#ifndef HEADER_WORKSPACE +#define HEADER_WORKSPACE + +#include + +void *workspace_malloc (size_t); +void workspace_free (void *, size_t); + +#endif /* workspace.h */ diff --git a/tests/ChangeLog b/tests/ChangeLog index 2b27655a1b..e065558f7b 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,23 @@ +Mon Mar 29 15:25:09 2004 Ben Pfaff + + * Makefile.am: (TESTS) Add xforms/casefile.sh, + stats/descript-basic.sh, stats/descript-missing.sh, + stats/moments.sh. Remove command/descriptives.sh. + + * command/descriptives.sh: Removed. + + * command/weight.sh: Fix output (statistic values were wrong!). + + * stats/descript-basic.sh: New test. + + * stats/descript-missing.sh: New test. + + * stats/moments.sh: New test. + + * xforms/casefile.sh: New test. + + * xforms/expressions.sh: Cleans up after itself now. + Fri Mar 26 00:55:48 2004 Ben Pfaff * Makefile.am: (TESTS) Add xforms/expressions.sh, remove diff --git a/tests/Makefile.am b/tests/Makefile.am index 9e9bd47b5f..2df0749e15 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -6,7 +6,6 @@ TESTS = command/aggregate.sh \ command/beg-data.sh \ command/bignum.sh \ command/count.sh \ - command/descriptives.sh \ command/erase.sh \ command/file-label.sh \ command/filter.sh \ @@ -42,7 +41,11 @@ TESTS = command/aggregate.sh \ bugs/t-test.sh \ bugs/temporary.sh \ bugs/val-labs.sh \ - xforms/expressions.sh + xforms/casefile.sh \ + xforms/expressions.sh \ + stats/descript-basic.sh \ + stats/descript-missing.sh \ + stats/moments.sh noinst_PROGRAMS = gengarbage diff --git a/tests/command/weight.sh b/tests/command/weight.sh index df34b49935..9243cb32a0 100755 --- a/tests/command/weight.sh +++ b/tests/command/weight.sh @@ -75,7 +75,7 @@ diff -B -b $TEMPDIR/pspp.list - < $TEMPDIR/descript.stat < $TEMPDIR/moments-list-1p <<'EOF' +# Both the one-pass and two-pass algorithms should be +# able to cope properly with these. +1 2 3 4 => W=4.000 M1=2.500 M2=1.667 M3=0.000 M4=-1.200 +1*5 2*5 3*5 4*5 => W=20.000 M1=2.500 M2=1.316 M3=0.000 M4=-1.401 +1*1 2*2 3*3 4*4 => W=10.000 M1=3.000 M2=1.111 M3=-0.712 M4=-0.450 +1*0 => W=0.000 M1=sysmis M2=sysmis M3=sysmis M4=sysmis +1*1 => W=1.000 M1=1.000 M2=sysmis M3=sysmis M4=sysmis +1*2 => W=2.000 M1=1.000 M2=0.000 M3=sysmis M4=sysmis +1*3 => W=3.000 M1=1.000 M2=0.000 M3=sysmis M4=sysmis +1*2 3 => W=3.000 M1=1.667 M2=1.333 M3=1.732 M4=sysmis +1 1.00000001 => W=2.000 M1=1.000 M2=0.000 M3=sysmis M4=sysmis +EOF +if [ $? -ne 0 ] ; then no_result ; fi + +cp $TEMPDIR/moments-list-1p $TEMPDIR/moments-list-2p +sed -ne 's/#.*//;/^[ ]*$/!p' >> $TEMPDIR/moments-list-2p <<'EOF' +# Only the two-pass algorithm can be expected to produce +# proper third and fourth moments here. +1000001 1000002 1000003 1000004 => W=4.000 M1=1000002.500 M2=1.667 M3=0.000 M4=-1.200 +EOF + +activity="create two-pass input file" +sed < $TEMPDIR/moments-list-2p >> $TEMPDIR/moments-2p.stat \ + -e 's#^\(.*\) => \(.*\)$#DEBUG MOMENTS/\1.#' +if [ $? -ne 0 ] ; then no_result ; fi + +activity="run program" +$SUPERVISOR $here/../src/pspp --testing-mode -o raw-ascii \ + $TEMPDIR/moments-2p.stat >$TEMPDIR/moments-2p.err 2> $TEMPDIR/moments-2p.out + +activity="compare output" +diff -B -b $TEMPDIR/moments-list-2p $TEMPDIR/moments-2p.out +if [ $? -ne 0 ] ; then fail ; fi + +activity="create input file" +sed < $TEMPDIR/moments-list-1p >> $TEMPDIR/moments-1p.stat \ + -e 's#^\(.*\) => \(.*\)$#DEBUG MOMENTS ONEPASS/\1.#' +if [ $? -ne 0 ] ; then no_result ; fi + +activity="run program" +$SUPERVISOR $here/../src/pspp --testing-mode -o raw-ascii \ + $TEMPDIR/moments-1p.stat >$TEMPDIR/moments-1p.err 2> $TEMPDIR/moments-1p.out + +activity="compare output" +diff -B -b $TEMPDIR/moments-list-1p $TEMPDIR/moments-1p.out +if [ $? -ne 0 ] ; then fail ; fi + +pass diff --git a/tests/xforms/casefile.sh b/tests/xforms/casefile.sh new file mode 100755 index 0000000000..1215b84eab --- /dev/null +++ b/tests/xforms/casefile.sh @@ -0,0 +1,70 @@ +#!/bin/sh + +# This program tests casefiles by running DEBUG CASEFILE. + +TEMPDIR=/tmp/pspp-tst-$$ + +here=`pwd`; + +# ensure that top_srcdir is absolute +cd $top_srcdir; top_srcdir=`pwd` + +export STAT_CONFIG_PATH=$top_srcdir/config + + +cleanup() +{ + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + +activity="create program" +cat > $TEMPDIR/casefile.stat < $TEMPDIR/casefile.out +if [ $? -ne 0 ] ; then no_result ; fi + +activity="compare results" +diff -b -B $TEMPDIR/casefile.out - <