1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28 /* Definition of the max length of a short string value, generally
30 #define MAX_SHORT_STRING ((SIZEOF_DOUBLE)>=8 ? ((SIZEOF_DOUBLE)+1)/2*2 : 8)
31 #define MIN_LONG_STRING (MAX_SHORT_STRING+1)
33 /* FYI: It is a bad situation if sizeof(flt64) < MAX_SHORT_STRING:
34 then short string missing values can be truncated in system files
35 because there's only room for as many characters as can fit in a
37 #if MAX_SHORT_STRING > 8
38 #error MAX_SHORT_STRING must be less than 8.
42 #define SYSMIS (-DBL_MAX)
43 #define LOWEST second_lowest_value
44 #define HIGHEST DBL_MAX
46 /* Describes one value, which is either a floating-point number or a
50 /* A numeric value. */
53 /* A short-string value. */
54 unsigned char s[MAX_SHORT_STRING];
56 /* This member is used by data-in.c to return a string result,
57 since it may need to return a long string. As currently
58 implemented, it's a pointer to a static internal buffer in
61 Also used by evaluate_expression() to return a string result.
62 As currently implemented, it's a pointer to a dynamic buffer in
63 the appropriate expression.
65 Also used by the AGGREGATE procedure in handling string
69 /* Sometimes we insert value's in a hash table. */
70 unsigned long hash[SIZEOF_DOUBLE / SIZEOF_LONG];
73 /* Frequency tables. */
75 /* Frequency table entry. */
78 union value v; /* The value. */
79 double c; /* The number of occurrences of the value. */
82 /* Types of frequency tables. */
89 /* Entire frequency table. */
92 int mode; /* Table type: FRQM_GENERAL or FRQM_INTEGER. */
95 struct hsh_table *data; /* Undifferentiated data, held in a hash table. */
98 double *vector; /* Frequencies proper. */
99 int min, max; /* The boundaries of the table. */
100 double out_of_range; /* Sum of weights of out-of-range values. */
101 double sysmis; /* Sum of weights of SYSMIS values. */
104 struct freq *valid; /* Valid freqs. */
105 int n_valid; /* Number of total freqs.  NOTE(review): the field name suggests the count of entries in `valid'; confirm which is meant. */
107 struct freq *missing; /* Missing freqs. */
108 int n_missing; /* Number of missing freqs. */
111 double total_cases; /* Sum of weights of all cases. */
112 double valid_cases; /* Sum of weights of valid cases. */
115 /* Procedures' private per-variable data. */
117 /* Structure name suffixes for private data:
118 _proc: for a procedure (e.g., LIST -> list_proc).
119 _trns: for a transformation (e.g., COMPUTE -> compute_trns).
120 _pgm: for an input program (e.g., DATA LIST -> data_list_pgm). */
122 /* CROSSTABS private data. */
125 /* Integer mode only. */
126 int min; /* Minimum value. */
127 int max; /* Maximum value + 1. */
128 int count; /* max - min. */
132 /* T-TEST private data */
137 /* Population std. deviation */
140 /* Sample std. deviation */
151 /* Std Err of Mean */
154 /* Sum of differences */
157 /* Mean of differences */
162 /* FREQUENCIES private data. */
165 frq_mean = 0, frq_semean, frq_median, frq_mode, frq_stddev, frq_variance,
166 frq_kurt, frq_sekurt, frq_skew, frq_seskew, frq_range, frq_min, frq_max,
170 struct frequencies_proc
172 int used; /* 1=This variable already used. */
174 /* Frequency table. */
175 struct freq_tab tab; /* Frequencies table to use. */
178 int n_groups; /* Number of groups. */
179 double *groups; /* Groups. */
182 double stat[frq_n_stats]; /* One slot per statistic; presumably indexed by the frq_* constants (confirm). */
185 /* LIST private data. */
188 int newline; /* Whether a new line begins here. */
189 int width; /* Field width. */
190 int vert; /* Whether to print the varname vertically. */
193 /* DESCRIPTIVES private data. Note that the DESCRIPTIVES procedure also
194 has a transformation, descriptives_trns. */
197 /* As these are used as bit indexes, there must be 32 or fewer.
198 Be very careful in adjusting these, see the structure below
199 and the table in descriptives.q. */
200 dsc_mean = 0, dsc_semean, dsc_stddev, dsc_variance, dsc_kurt,
201 dsc_sekurt, dsc_skew, dsc_seskew, dsc_range, dsc_min,
202 dsc_max, dsc_sum, dsc_n_stats /* Listed last, so as an enumerator counting up from dsc_mean = 0 it equals the number of statistics before it. */
205 struct descriptives_proc
208 int dup; /* Finds duplicates in list of
210 char zname[10]; /* Name for z-score variable. */
213 double valid, miss; /* Valid, missing--general. */
215 /* Mean, moments about the mean. */
216 double X_bar, M2, M3, M4;
220 double stats[dsc_n_stats]; /* Everything glommed together. */
223 /* GET private data. */
226 int fv, nv; /* First, # of values. */
232 SRT_ASCEND, /* A, B, C, ..., X, Y, Z. */
233 SRT_DESCEND /* Z, Y, X, ..., C, B, A. */
236 /* SORT CASES private data. */
237 struct sort_cases_proc
239 int order; /* SRT_ASCEND or SRT_DESCEND. */
242 /* MEANS private data. */
245 double min, max; /* Range for integer mode. */
248 /* Different types of variables for MATRIX DATA procedure. Order is
249 important: these are used for sort keys. */
252 MXD_SPLIT, /* SPLIT FILE variables. */
253 MXD_ROWTYPE, /* ROWTYPE_. */
254 MXD_FACTOR, /* Factor variables. */
255 MXD_VARNAME, /* VARNAME_. */
256 MXD_CONTINUOUS, /* Continuous variables. */
261 /* MATRIX DATA private data. */
262 struct matrix_data_proc
264 int vartype; /* Variable type. */
265 int subtype; /* Subtype. */
268 /* MATCH FILES private data. */
269 struct match_files_proc
271 struct variable *master; /* Corresponding master file variable. */
275 /* Script variables. */
280 NUMERIC, /* A numeric variable. */
281 ALPHA /* A string variable. (STRING is pre-empted by lexer.h) */
284 /* Types of missing values. Order is significant, see
285 mis-val.c:parse_numeric(), sfm-read.c:sfm_read_dictionary()
286 sfm-write.c:sfm_write_dictionary(),
287 sysfile-info.c:cmd_sysfile_info(), mis-val.c:copy_missing_values(),
288 pfm-read.c:read_variables(), pfm-write.c:write_variables(),
289 apply-dict.c:cmd_apply_dictionary(), and more (?). */
292 MISSING_NONE, /* No user-missing values. */
293 MISSING_1, /* One user-missing value. */
294 MISSING_2, /* Two user-missing values. */
295 MISSING_3, /* Three user-missing values. */
296 MISSING_RANGE, /* [a,b]. */
297 MISSING_LOW, /* (-inf,a]. */
298 MISSING_HIGH, /* [a,+inf) -- TODO confirm a is inclusive, mirroring MISSING_LOW. */
299 MISSING_RANGE_1, /* [a,b], c. */
300 MISSING_LOW_1, /* (-inf,a], b. */
301 MISSING_HIGH_1, /* [a,+inf), b -- TODO confirm a is inclusive, mirroring MISSING_LOW_1. */
305 /* A variable's dictionary entry. Note: don't reorder name[] from the
306 first element; a pointer to `variable' should be a pointer to
310 /* Required by parse_variables() to be in this order. */
311 char name[9]; /* As a string. */
312 int index; /* Index into its dictionary's var[]. */
313 int type; /* NUMERIC or ALPHA. */
315 /* Also important but parse_variables() doesn't need it. Still,
316 check before reordering. */
317 int width; /* Size of string variables in chars. */
318 int fv, nv; /* Index into `value's, number of values. */
319 int left; /* 0=reinitialize each case, 1=don't. */
321 /* Missing values. */
322 int miss_type; /* One of the MISSING_* constants. */
323 union value missing[3]; /* User-missing values; room for up to three (cf. MISSING_3). */
325 /* Display formats. */
326 struct fmt_spec print; /* Default format for PRINT. */
327 struct fmt_spec write; /* Default format for WRITE. */
330 struct val_labs *val_labs; /* Presumably this variable's value labels; struct val_labs is declared elsewhere. */
331 char *label; /* Variable label. */
333 /* Per-procedure info. */
338 struct crosstab_proc crs; /* CROSSTABS. */
339 struct descriptives_proc dsc; /* DESCRIPTIVES. */
340 struct frequencies_proc frq; /* FREQUENCIES. */
341 struct list_proc lst; /* LIST. */
342 struct means_proc mns; /* MEANS. */
343 struct sort_cases_proc srt; /* SORT CASES. */
344 struct matrix_data_proc mxd; /* MATRIX DATA. */
345 struct match_files_proc mtf; /* MATCH FILES. */
346 struct t_test_proc t_t; /* T-TEST. */
351 int compare_variables (const void *, const void *, void *);
352 unsigned hash_variable (const void *, void *);
354 /* Classes of variables. */
357 DC_ORDINARY, /* Ordinary identifier. */
358 DC_SYSTEM, /* System variable. */
359 DC_SCRATCH /* Scratch variable. */
362 enum dict_class dict_class_from_id (const char *name);
363 const char *dict_class_to_name (enum dict_class dict_class);
365 /* Vector of variables. */
368 int idx; /* Index for dict_get_vector(). */
369 char name[9]; /* Name. */
370 struct variable **var; /* Vector of variables. */
371 int cnt; /* Number of variables. */
376 /* A single case. (This doesn't need to be a struct anymore, but it
377 remains so for hysterical raisins.) */
385 /* Complete dictionary state. */
388 struct dictionary *dict_create (void); /* Dictionary lifecycle declarations follow; the implementations are not in this header. */
389 struct dictionary *dict_clone (const struct dictionary *);
390 void dict_clear (struct dictionary *);
391 void dict_destroy (struct dictionary *);
393 size_t dict_get_var_cnt (const struct dictionary *);
394 struct variable *dict_get_var (const struct dictionary *, size_t idx); /* idx presumably ranges over [0, dict_get_var_cnt()) -- confirm. */
395 void dict_get_vars (const struct dictionary *,
396 struct variable ***vars, size_t *cnt,
397 unsigned exclude_classes); /* vars and cnt are pointers the callee can write through; exclude_classes is presumably built from the DC_* classes -- confirm. */
399 struct variable *dict_create_var (struct dictionary *, const char *,
401 struct variable *dict_clone_var (struct dictionary *, const struct variable *,
403 void dict_rename_var (struct dictionary *, struct variable *, const char *);
405 struct variable *dict_lookup_var (const struct dictionary *, const char *);
406 int dict_contains_var (const struct dictionary *, const struct variable *); /* Returns int, presumably nonzero when the variable belongs to the dictionary -- confirm. */
407 void dict_delete_var (struct dictionary *, struct variable *);
408 void dict_delete_vars (struct dictionary *,
409 struct variable *const *, size_t count);
410 void dict_reorder_vars (struct dictionary *,
411 struct variable *const *, size_t count);
412 int dict_rename_vars (struct dictionary *,
413 struct variable **, char **new_names,
414 size_t count, char **err_name); /* Returns int; err_name looks like an out-parameter reporting a problematic name -- confirm against the implementation. */
416 struct variable *dict_get_weight (const struct dictionary *);
417 double dict_get_case_weight (const struct dictionary *, const struct ccase *);
418 void dict_set_weight (struct dictionary *, struct variable *);
420 struct variable *dict_get_filter (const struct dictionary *);
421 void dict_set_filter (struct dictionary *, struct variable *);
423 int dict_get_case_limit (const struct dictionary *);
424 void dict_set_case_limit (struct dictionary *, int);
426 int dict_get_value_cnt (const struct dictionary *);
427 void dict_compact_values (struct dictionary *);
429 struct variable *const *dict_get_split_vars (const struct dictionary *); /* Elements of the returned array are const pointers; its length is presumably dict_get_split_cnt() -- confirm. */
430 size_t dict_get_split_cnt (const struct dictionary *);
431 void dict_set_split_vars (struct dictionary *,
432 struct variable *const *, size_t cnt);
434 const char *dict_get_label (const struct dictionary *);
435 void dict_set_label (struct dictionary *, const char *);
437 const char *dict_get_documents (const struct dictionary *);
438 void dict_set_documents (struct dictionary *, const char *);
440 int dict_create_vector (struct dictionary *,
442 struct variable **, size_t cnt);
443 const struct vector *dict_get_vector (const struct dictionary *,
445 size_t dict_get_vector_cnt (const struct dictionary *);
446 const struct vector *dict_lookup_vector (const struct dictionary *,
448 void dict_clear_vectors (struct dictionary *);
450 void discard_variables (void); /* Takes no dictionary argument; presumably operates on global state -- confirm. */
452 /* This is the active file dictionary. */
453 extern struct dictionary *default_dict;
455 /* Transformation state. */
457 /* Default file handle for DATA LIST, REREAD, REPEATING DATA
459 extern struct file_handle *default_handle;
461 /* PROCESS IF expression. */
462 extern struct expression *process_if_expr;
464 /* TEMPORARY support. */
466 /* 1=TEMPORARY has been executed at some point. */
467 extern int temporary;
469 /* If temporary!=0, the saved dictionary. */
470 extern struct dictionary *temp_dict;
472 /* If temporary!=0, index into t_trns[] (declared far below) that
473 gives the point at which data should be written out. -1 means that
474 the data shouldn't be changed since all transformations are
476 extern int temp_trns;
478 /* If FILTER is active, whether it was executed before or after
480 extern int FILTER_before_TEMPORARY;
482 void cancel_temporary (void);
486 void dump_split_vars (const struct ccase *);
488 int is_num_user_missing (double, const struct variable *);
489 int is_str_user_missing (const unsigned char[], const struct variable *);
490 int is_missing (const union value *, const struct variable *);
491 int is_system_missing (const union value *, const struct variable *);
492 int is_user_missing (const union value *, const struct variable *);
493 void copy_missing_values (struct variable *dest, const struct variable *src);
496 struct variable *force_create_variable (struct dictionary *, const char *name,
497 int type, int width);
498 struct variable *force_dup_variable (struct dictionary *,
499 const struct variable *src,
502 #define force_create_variable(A, B, C, D) \
503 create_variable (A, B, C, D)
504 #define force_dup_variable(A, B, C) \
505 dup_variable (A, B, C)
509 /* Transformations. */
511 /* Header for all transformations. */
514 /* Index into t_trns[]. */
517 /* Transformation proc. */
518 int (*proc) (struct trns_header *, struct ccase *);
520 /* Garbage collector proc. */
521 void (*free) (struct trns_header *);
524 /* Array of transformations */
525 extern struct trns_header **t_trns;
527 /* Number of transformations, maximum number in array currently. */
528 extern int n_trns, m_trns;
530 /* Index of first transformation that is really a transformation. Any
531 transformations before this belong to INPUT PROGRAM. */
534 void add_transformation (struct trns_header *trns);
535 void cancel_transformations (void);
539 struct var_set *var_set_create_from_dict (struct dictionary *d);
540 struct var_set *var_set_create_from_array (struct variable **var, size_t);
542 size_t var_set_get_cnt (struct var_set *vs);
543 struct variable *var_set_get_var (struct var_set *vs, size_t idx);
544 struct variable *var_set_lookup_var (struct var_set *vs, const char *name);
545 void var_set_destroy (struct var_set *vs);
548 /* Variable parsers. */
552 PV_NONE = 0, /* No options.  (Values below are octal literals; each nonzero one is a distinct single bit.) */
553 PV_SINGLE = 0001, /* Restrict to a single name or TO use. */
554 PV_DUPLICATE = 0002, /* Don't merge duplicates. */
555 PV_APPEND = 0004, /* Append to existing list. */
556 PV_NO_DUPLICATE = 0010, /* Error on duplicates. */
557 PV_NUMERIC = 0020, /* Vars must be numeric. */
558 PV_STRING = 0040, /* Vars must be string. */
559 PV_SAME_TYPE = 00100, /* All vars must be the same type. */
560 PV_NO_SCRATCH = 00200, /* Disallow scratch variables. */
563 struct variable *parse_variable (void);
564 struct variable *parse_dict_variable (struct dictionary *);
565 int parse_variables (struct dictionary *, struct variable ***, int *,
567 int parse_var_set_vars (struct var_set *, struct variable ***, int *,
569 int parse_DATA_LIST_vars (char ***names, int *cnt, int opts);
570 int parse_mixed_vars (char ***names, int *cnt, int opts);