From a12aeb43c5edf27b0ef2be5c2d7db30214a31a2b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 2 Sep 2019 05:14:52 +0000 Subject: [PATCH] AUTORECODE: Fix treatment of missing values. User-missing values are supposed to be recoded to a separate range above the non-missing values. The system-missing value is supposed to not be recoded at all. This fixes and documents that behavior. --- doc/transformation.texi | 28 +++++-- src/data/missing-values.c | 28 +++++++ src/data/missing-values.h | 3 + src/language/stats/autorecode.c | 120 ++++++++++++++++++++++++----- tests/language/stats/autorecode.at | 100 ++++++++++++++++++------ 5 files changed, 231 insertions(+), 48 deletions(-) diff --git a/doc/transformation.texi b/doc/transformation.texi index e10cf8374f..5ee7febef0 100644 --- a/doc/transformation.texi +++ b/doc/transformation.texi @@ -247,17 +247,33 @@ variables, @subcmd{INTO}, and a list of target variables. There must the same number of source and target variables. The target variables must not already exist. -By default, increasing values of a source variable (for a string, this -is based on character code comparisons) are recoded to increasing values -of its target variable. To cause increasing values of a source variable -to be recoded to decreasing values of its target variable (@var{n} down -to 1), specify @subcmd{DESCENDING}. +@cmd{AUTORECODE} ordinarily assigns each increasing non-missing value +of a source variable (for a string, this is based on character code +comparisons) to consecutive values of its target variable. For +example, the smallest non-missing value of the source variable is +recoded to value 1, the next smallest to 2, and so on. If the source +variable has user-missing values, they are recoded to +consecutive values just above the non-missing values. For example, if +a source variables has seven distinct non-missing values, then the +smallest missing value would be recoded to 8, the next smallest to 9, +and so on. + +Use @subcmd{DESCENDING} to reverse the sort order for non-missing +values, so that the largest non-missing value is recoded to 1, the +second-largest to 2, and so on. Even with @subcmd{DESCENDING}, +user-missing values are still recoded in ascending order just above +the non-missing values. + +The system-missing value is always recoded into the system-missing +variable in target variables. @subcmd{PRINT} is currently ignored. The @subcmd{GROUP} subcommand is relevant only if more than one variable is to be recoded. It causes a single mapping between source and target values to -be used, instead of one map per variable. +be used, instead of one map per variable. With @subcmd{GROUP}, +user-missing values are taken from the first source variable that has +any user-missing values. If @subcmd{/BLANK=MISSING} is given, then string variables which contain only whitespace are recoded as SYSMIS. If @subcmd{/BLANK=VALID} is given then they diff --git a/src/data/missing-values.c b/src/data/missing-values.c index 0126651638..267d34fb91 100644 --- a/src/data/missing-values.c +++ b/src/data/missing-values.c @@ -474,6 +474,34 @@ mv_is_str_missing (const struct missing_values *mv, const uint8_t s[], return class & MV_USER && is_str_user_missing (mv, s); } +/* Like mv_is_value_missing(), this tests whether V is a missing value + in the given CLASS in MV. It supports the uncommon case where V + and MV might have different widths: the caller must specify VW, the + width of V. MV and VW must be both numeric or both string. + + Comparison of strings of different width is done by conceptually + extending both strings to infinite width by appending spaces. */ +bool +mv_is_value_missing_varwidth (const struct missing_values *mv, + const union value *v, int vw, + enum mv_class class) +{ + int mvw = mv->width; + if (mvw == vw) + return mv_is_value_missing (mv, v, class); + + /* Make sure they're both strings. */ + assert (mvw && vw); + if (!(class & MV_USER) || mv->type == MVT_NONE) + return false; + + for (int i = 0; i < mv->type; i++) + if (!buf_compare_rpad (CHAR_CAST_BUG (const char *, mv->values[i].s), mvw, + CHAR_CAST_BUG (const char *, v->s), vw)) + return true; + return false; +} + char * mv_to_string (const struct missing_values *mv, const char *encoding) { diff --git a/src/data/missing-values.h b/src/data/missing-values.h index c4d5695074..9d9a884d08 100644 --- a/src/data/missing-values.h +++ b/src/data/missing-values.h @@ -66,6 +66,9 @@ bool mv_is_value_missing (const struct missing_values *, const union value *, bool mv_is_num_missing (const struct missing_values *, double, enum mv_class); bool mv_is_str_missing (const struct missing_values *, const uint8_t[], enum mv_class); +bool mv_is_value_missing_varwidth (const struct missing_values *, + const union value *, int value_width, + enum mv_class); /* Initializing missing value sets. */ void mv_init (struct missing_values *, int width); diff --git a/src/language/stats/autorecode.c b/src/language/stats/autorecode.c index 66afb13afa..fd614ec5ab 100644 --- a/src/language/stats/autorecode.c +++ b/src/language/stats/autorecode.c @@ -40,7 +40,7 @@ #include "gl/xalloc.h" #include "gl/c-xvasprintf.h" #include "gl/mbiter.h" - +#include "gl/size_max.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -53,6 +53,7 @@ struct arc_item struct hmap_node hmap_node; /* Element in "struct arc_spec" hash table. */ union value from; /* Original value. */ int width; /* Width of the original value */ + bool missing; /* Is 'from' missing in its source varible? */ double to; /* Recoded value. */ }; @@ -63,6 +64,7 @@ struct arc_spec int width; /* Variable width. */ int src_idx; /* Case index of source variable. */ struct variable *dst; /* Target variable. */ + struct missing_values mv; /* Missing values of source variable. */ struct rec_items *items; }; @@ -246,6 +248,29 @@ cmd_autorecode (struct lexer *lexer, struct dataset *ds) } } + /* Initialize specs[*]->mv to the user-missing values for each + source variable. */ + if (group) + { + /* Use the first source variable that has any user-missing values. */ + size_t mv_idx = 0; + for (size_t i = 0; i < n_dsts; i++) + if (var_has_missing_values (src_vars[i])) + { + mv_idx = i; + break; + } + + for (size_t i = 0; i < n_dsts; i++) + mv_copy (&arc->specs[i].mv, var_get_missing_values (src_vars[mv_idx])); + } + else + { + /* Each variable uses its own user-missing values. */ + for (size_t i = 0; i < n_dsts; i++) + mv_copy (&arc->specs[i].mv, var_get_missing_values (src_vars[i])); + } + /* Execute procedure. */ struct casereader *input = proc_open (ds); struct ccase *c; @@ -254,6 +279,15 @@ cmd_autorecode (struct lexer *lexer, struct dataset *ds) { struct arc_spec *spec = &arc->specs[i]; const union value *value = case_data_idx (c, spec->src_idx); + if (spec->width == 0 && value->f == SYSMIS) + { + /* AUTORECODE never changes the system-missing value. + (Leaving it out of the translation table has this + effect automatically because values not found in the + translation table get translated to system-missing.) */ + continue; + } + int width = value_trim_spaces (value, spec->width); if (width == 1 && value->s[0] == ' ' && !arc->blank_valid) continue; @@ -265,6 +299,8 @@ cmd_autorecode (struct lexer *lexer, struct dataset *ds) struct arc_item *item = xmalloc (sizeof *item); item->width = width; value_clone (&item->from, value, width); + item->missing = mv_is_value_missing_varwidth (&spec->mv, value, spec->width, + MV_ANY); hmap_insert (&spec->items->ht, &item->hmap_node, hash); } bool ok = casereader_destroy (input); @@ -296,11 +332,45 @@ cmd_autorecode (struct lexer *lexer, struct dataset *ds) assert (j == n_items); /* Sort array by value. */ - sort (items, n_items, sizeof *items, compare_arc_items, NULL); + sort (items, n_items, sizeof *items, compare_arc_items, &direction); /* Assign recoded values in sorted order. */ for (j = 0; j < n_items; j++) - items[j]->to = direction == ASCENDING ? j + 1 : n_items - j; + items[j]->to = j + 1; + + /* Assign user-missing values. + + User-missing values in the source variable(s) must be marked + as user-missing values in the destination variable. There + might be an arbitrary number of missing values, since the + source variable might have a range. Our sort function always + puts missing values together at the top of the range, so that + means that we can use a missing value range to cover all of + the user-missing values in any case (but we avoid it unless + necessary because user-missing value ranges are an obscure + feature). */ + size_t n_missing = n_items; + for (size_t k = 0; k < n_items; k++) + if (!items[n_items - k - 1]->missing) + { + n_missing = k; + break; + } + if (n_missing > 0) + { + size_t lo = n_items - (n_missing - 1); + size_t hi = n_items; + + struct missing_values mv; + mv_init (&mv, 0); + if (n_missing > 3) + mv_add_range (&mv, lo, hi); + else if (n_missing > 0) + for (size_t k = 0; k < n_missing; k++) + mv_add_num (&mv, lo + k); + var_set_missing_values (spec->dst, &mv); + mv_destroy (&mv); + } /* Add value labels to the destination variable which indicate the source value from whence the new value comes. */ @@ -371,6 +441,7 @@ arc_free (struct autorecode_pgm *arc) hmap_delete (&spec->items->ht, &item->hmap_node); free (item); } + mv_destroy (&spec->mv); } size_t n_rec_items = @@ -403,24 +474,35 @@ find_arc_item (const struct rec_items *items, } static int -compare_arc_items (const void *a_, const void *b_, const void *aux UNUSED) +compare_arc_items (const void *a_, const void *b_, const void *direction_) { - const struct arc_item *const *a = a_; - const struct arc_item *const *b = b_; - int width_a = (*a)->width; - int width_b = (*b)->width; - - if ( width_a == width_b) - return value_compare_3way (&(*a)->from, &(*b)->from, width_a); - - if ( width_a == 0 && width_b != 0) - return -1; - - if ( width_b == 0 && width_a != 0) - return +1; + const struct arc_item *const *ap = a_; + const struct arc_item *const *bp = b_; + const struct arc_item *a = *ap; + const struct arc_item *b = *bp; + + /* User-missing values always sort to the highest target values + (regardless of sort direction). */ + if (a->missing != b->missing) + return a->missing < b->missing ? -1 : 1; + + /* Otherwise, compare the data. */ + int aw = a->width; + int bw = b->width; + int cmp; + if (aw == bw) + cmp = value_compare_3way (&a->from, &b->from, aw); + else + { + assert (aw && bw); + cmp = buf_compare_rpad (CHAR_CAST_BUG (const char *, a->from.s), aw, + CHAR_CAST_BUG (const char *, b->from.s), bw); + } - return buf_compare_rpad (CHAR_CAST_BUG (const char *, (*a)->from.s), width_a, - CHAR_CAST_BUG (const char *, (*b)->from.s), width_b); + /* Then apply sort direction. */ + const enum arc_direction *directionp = direction_; + enum arc_direction direction = *directionp; + return direction == ASCENDING ? cmp : -cmp; } static int diff --git a/tests/language/stats/autorecode.at b/tests/language/stats/autorecode.at index 42b35b81b9..0b688adbb0 100644 --- a/tests/language/stats/autorecode.at +++ b/tests/language/stats/autorecode.at @@ -16,6 +16,50 @@ dnl along with this program. If not, see . dnl AT_BANNER([AUTORECODE procedure]) +AT_SETUP([AUTORECODE multiple missing values]) +AT_DATA([autorecode.sps], + [DATA LIST LIST NOTABLE /u v w x y z (F2.0). +BEGIN DATA. +11 11 11 11 11 11 +12 12 12 12 12 12 +13 13 13 13 13 13 +14 14 14 14 14 14 +15 15 15 15 15 15 +16 16 16 16 16 16 +END DATA. + +MISSING VALUES u (11) + v (11, 12) + w (11, 12, 13) + x (11 THRU 14) + y (11 THRU 15) + z (11 THRU 16). + +AUTORECODE u v w x y z INTO a b c d e f. +LIST. +DISPLAY VARIABLES/VARIABLES=a TO f. +]) +AT_CHECK([pspp -O format=csv autorecode.sps], [0], [dnl +Table: Data List +u,v,w,x,y,z,a,b,c,d,e,f +11,11,11,11,11,11,6.00,5.00,4.00,3.00,2.00,1.00 +12,12,12,12,12,12,1.00,6.00,5.00,4.00,3.00,2.00 +13,13,13,13,13,13,2.00,1.00,6.00,5.00,4.00,3.00 +14,14,14,14,14,14,3.00,2.00,1.00,6.00,5.00,4.00 +15,15,15,15,15,15,4.00,3.00,2.00,1.00,6.00,5.00 +16,16,16,16,16,16,5.00,4.00,3.00,2.00,1.00,6.00 + +Table: Variables +Name,Position,Print Format,Write Format,Missing Values +a,7,F8.2,F8.2,6 +b,8,F8.2,F8.2,5; 6 +c,9,F8.2,F8.2,4; 5; 6 +d,10,F8.2,F8.2,3 THRU 6 +e,11,F8.2,F8.2,2 THRU 6 +f,12,F8.2,F8.2,1 THRU 6 +]) +AT_CLEANUP + AT_SETUP([AUTORECODE numbers and short strings]) AT_DATA([autorecode.sps], [data list /X 1-5(a) Y 7. @@ -31,6 +75,8 @@ asdfk 0 asdfk 1 end data. +missing values x('asdfk') y(9). + autorecode x y into A B/descend. list. @@ -46,27 +92,27 @@ Y,1,7- 7,F1.0 Table: Data List X,Y,A,B -lasdj,1,1.00,3.00 -asdfk,0,3.00,4.00 -asdfj,2,4.00,2.00 -asdfj,1,4.00,3.00 -asdfk,2,3.00,2.00 -asdfj,9,4.00,1.00 -lajks,9,2.00,1.00 -asdfk,0,3.00,4.00 -asdfk,1,3.00,3.00 +lasdj,1,1.00,2.00 +asdfk,0,4.00,3.00 +asdfj,2,3.00,1.00 +asdfj,1,3.00,2.00 +asdfk,2,4.00,1.00 +asdfj,9,3.00,4.00 +lajks,9,2.00,4.00 +asdfk,0,4.00,3.00 +asdfk,1,4.00,2.00 Table: Data List X,Y,A,B,Z,W -lasdj,1,1.00,3.00,.00,1.00 -asdfk,0,3.00,4.00,.00,1.00 -asdfj,2,4.00,2.00,1.00,2.00 -asdfj,1,4.00,3.00,.00,1.00 -asdfk,2,3.00,2.00,1.00,2.00 -asdfj,9,4.00,1.00,4.00,3.00 -lajks,9,2.00,1.00,4.00,3.00 -asdfk,0,3.00,4.00,.00,1.00 -asdfk,1,3.00,3.00,.00,1.00 +lasdj,1,1.00,2.00,.00,1.00 +asdfk,0,4.00,3.00,.00,1.00 +asdfj,2,3.00,1.00,1.00,2.00 +asdfj,1,3.00,2.00,.00,1.00 +asdfk,2,4.00,1.00,1.00,2.00 +asdfj,9,3.00,4.00,. ,. @&t@ +lajks,9,2.00,4.00,. ,. @&t@ +asdfk,0,4.00,3.00,.00,1.00 +asdfk,1,4.00,2.00,.00,1.00 ]) AT_CLEANUP @@ -132,22 +178,30 @@ begin data. 16 18 end data. +missing values y (12). + autorecode x y into a b /group. list. +display variables /variables=a b. ]) AT_CHECK([pspp -O format=csv ar-group.sps], [0], [Table: Data List x,y,a,b 11.00,10.00,2.00,1.00 -12.00,12.00,3.00,3.00 -13.00,15.00,4.00,6.00 -14.00,11.00,5.00,2.00 -15.00,12.00,6.00,3.00 -16.00,18.00,7.00,8.00 +12.00,12.00,8.00,8.00 +13.00,15.00,3.00,5.00 +14.00,11.00,4.00,2.00 +15.00,12.00,5.00,8.00 +16.00,18.00,6.00,7.00 + +Table: Variables +Name,Position,Print Format,Write Format,Missing Values +a,3,F8.2,F8.2,8 +b,4,F8.2,F8.2,8 ]) AT_CLEANUP -- 2.30.2