number of source and target variables. The target variables must not
already exist.
-By default, increasing values of a source variable (for a string, this
-is based on character code comparisons) are recoded to increasing values
-of its target variable. To cause increasing values of a source variable
-to be recoded to decreasing values of its target variable (@var{n} down
-to 1), specify @subcmd{DESCENDING}.
+@cmd{AUTORECODE} ordinarily assigns each increasing non-missing value
+of a source variable (for a string, this is based on character code
+comparisons) to consecutive values of its target variable. For
+example, the smallest non-missing value of the source variable is
+recoded to value 1, the next smallest to 2, and so on. If the source
+variable has user-missing values, they are recoded to
+consecutive values just above the non-missing values. For example, if
+a source variables has seven distinct non-missing values, then the
+smallest missing value would be recoded to 8, the next smallest to 9,
+and so on.
+
+Use @subcmd{DESCENDING} to reverse the sort order for non-missing
+values, so that the largest non-missing value is recoded to 1, the
+second-largest to 2, and so on. Even with @subcmd{DESCENDING},
+user-missing values are still recoded in ascending order just above
+the non-missing values.
+
+The system-missing value is always recoded into the system-missing
+variable in target variables.
@subcmd{PRINT} is currently ignored.
The @subcmd{GROUP} subcommand is relevant only if more than one variable is to be
recoded. It causes a single mapping between source and target values to
-be used, instead of one map per variable.
+be used, instead of one map per variable. With @subcmd{GROUP},
+user-missing values are taken from the first source variable that has
+any user-missing values.
If @subcmd{/BLANK=MISSING} is given, then string variables which contain only
whitespace are recoded as SYSMIS. If @subcmd{/BLANK=VALID} is given then they
return class & MV_USER && is_str_user_missing (mv, s);
}
+/* Like mv_is_value_missing(), this tests whether V is a missing value
+ in the given CLASS in MV. It supports the uncommon case where V
+ and MV might have different widths: the caller must specify VW, the
+ width of V. MV and VW must be both numeric or both string.
+
+ Comparison of strings of different width is done by conceptually
+ extending both strings to infinite width by appending spaces. */
+bool
+mv_is_value_missing_varwidth (const struct missing_values *mv,
+ const union value *v, int vw,
+ enum mv_class class)
+{
+ int mvw = mv->width;
+ if (mvw == vw)
+ return mv_is_value_missing (mv, v, class);
+
+ /* Make sure they're both strings. */
+ assert (mvw && vw);
+ if (!(class & MV_USER) || mv->type == MVT_NONE)
+ return false;
+
+ for (int i = 0; i < mv->type; i++)
+ if (!buf_compare_rpad (CHAR_CAST_BUG (const char *, mv->values[i].s), mvw,
+ CHAR_CAST_BUG (const char *, v->s), vw))
+ return true;
+ return false;
+}
+
char *
mv_to_string (const struct missing_values *mv, const char *encoding)
{
bool mv_is_num_missing (const struct missing_values *, double, enum mv_class);
bool mv_is_str_missing (const struct missing_values *, const uint8_t[],
enum mv_class);
+bool mv_is_value_missing_varwidth (const struct missing_values *,
+ const union value *, int value_width,
+ enum mv_class);
/* Initializing missing value sets. */
void mv_init (struct missing_values *, int width);
#include "gl/xalloc.h"
#include "gl/c-xvasprintf.h"
#include "gl/mbiter.h"
-
+#include "gl/size_max.h"
#include "gettext.h"
#define _(msgid) gettext (msgid)
struct hmap_node hmap_node; /* Element in "struct arc_spec" hash table. */
union value from; /* Original value. */
int width; /* Width of the original value */
+ bool missing; /* Is 'from' missing in its source varible? */
double to; /* Recoded value. */
};
int width; /* Variable width. */
int src_idx; /* Case index of source variable. */
struct variable *dst; /* Target variable. */
+ struct missing_values mv; /* Missing values of source variable. */
struct rec_items *items;
};
}
}
+ /* Initialize specs[*]->mv to the user-missing values for each
+ source variable. */
+ if (group)
+ {
+ /* Use the first source variable that has any user-missing values. */
+ size_t mv_idx = 0;
+ for (size_t i = 0; i < n_dsts; i++)
+ if (var_has_missing_values (src_vars[i]))
+ {
+ mv_idx = i;
+ break;
+ }
+
+ for (size_t i = 0; i < n_dsts; i++)
+ mv_copy (&arc->specs[i].mv, var_get_missing_values (src_vars[mv_idx]));
+ }
+ else
+ {
+ /* Each variable uses its own user-missing values. */
+ for (size_t i = 0; i < n_dsts; i++)
+ mv_copy (&arc->specs[i].mv, var_get_missing_values (src_vars[i]));
+ }
+
/* Execute procedure. */
struct casereader *input = proc_open (ds);
struct ccase *c;
{
struct arc_spec *spec = &arc->specs[i];
const union value *value = case_data_idx (c, spec->src_idx);
+ if (spec->width == 0 && value->f == SYSMIS)
+ {
+ /* AUTORECODE never changes the system-missing value.
+ (Leaving it out of the translation table has this
+ effect automatically because values not found in the
+ translation table get translated to system-missing.) */
+ continue;
+ }
+
int width = value_trim_spaces (value, spec->width);
if (width == 1 && value->s[0] == ' ' && !arc->blank_valid)
continue;
struct arc_item *item = xmalloc (sizeof *item);
item->width = width;
value_clone (&item->from, value, width);
+ item->missing = mv_is_value_missing_varwidth (&spec->mv, value, spec->width,
+ MV_ANY);
hmap_insert (&spec->items->ht, &item->hmap_node, hash);
}
bool ok = casereader_destroy (input);
assert (j == n_items);
/* Sort array by value. */
- sort (items, n_items, sizeof *items, compare_arc_items, NULL);
+ sort (items, n_items, sizeof *items, compare_arc_items, &direction);
/* Assign recoded values in sorted order. */
for (j = 0; j < n_items; j++)
- items[j]->to = direction == ASCENDING ? j + 1 : n_items - j;
+ items[j]->to = j + 1;
+
+ /* Assign user-missing values.
+
+ User-missing values in the source variable(s) must be marked
+ as user-missing values in the destination variable. There
+ might be an arbitrary number of missing values, since the
+ source variable might have a range. Our sort function always
+ puts missing values together at the top of the range, so that
+ means that we can use a missing value range to cover all of
+ the user-missing values in any case (but we avoid it unless
+ necessary because user-missing value ranges are an obscure
+ feature). */
+ size_t n_missing = n_items;
+ for (size_t k = 0; k < n_items; k++)
+ if (!items[n_items - k - 1]->missing)
+ {
+ n_missing = k;
+ break;
+ }
+ if (n_missing > 0)
+ {
+ size_t lo = n_items - (n_missing - 1);
+ size_t hi = n_items;
+
+ struct missing_values mv;
+ mv_init (&mv, 0);
+ if (n_missing > 3)
+ mv_add_range (&mv, lo, hi);
+ else if (n_missing > 0)
+ for (size_t k = 0; k < n_missing; k++)
+ mv_add_num (&mv, lo + k);
+ var_set_missing_values (spec->dst, &mv);
+ mv_destroy (&mv);
+ }
/* Add value labels to the destination variable which indicate
the source value from whence the new value comes. */
hmap_delete (&spec->items->ht, &item->hmap_node);
free (item);
}
+ mv_destroy (&spec->mv);
}
size_t n_rec_items =
}
static int
-compare_arc_items (const void *a_, const void *b_, const void *aux UNUSED)
+compare_arc_items (const void *a_, const void *b_, const void *direction_)
{
- const struct arc_item *const *a = a_;
- const struct arc_item *const *b = b_;
- int width_a = (*a)->width;
- int width_b = (*b)->width;
-
- if ( width_a == width_b)
- return value_compare_3way (&(*a)->from, &(*b)->from, width_a);
-
- if ( width_a == 0 && width_b != 0)
- return -1;
-
- if ( width_b == 0 && width_a != 0)
- return +1;
+ const struct arc_item *const *ap = a_;
+ const struct arc_item *const *bp = b_;
+ const struct arc_item *a = *ap;
+ const struct arc_item *b = *bp;
+
+ /* User-missing values always sort to the highest target values
+ (regardless of sort direction). */
+ if (a->missing != b->missing)
+ return a->missing < b->missing ? -1 : 1;
+
+ /* Otherwise, compare the data. */
+ int aw = a->width;
+ int bw = b->width;
+ int cmp;
+ if (aw == bw)
+ cmp = value_compare_3way (&a->from, &b->from, aw);
+ else
+ {
+ assert (aw && bw);
+ cmp = buf_compare_rpad (CHAR_CAST_BUG (const char *, a->from.s), aw,
+ CHAR_CAST_BUG (const char *, b->from.s), bw);
+ }
- return buf_compare_rpad (CHAR_CAST_BUG (const char *, (*a)->from.s), width_a,
- CHAR_CAST_BUG (const char *, (*b)->from.s), width_b);
+ /* Then apply sort direction. */
+ const enum arc_direction *directionp = direction_;
+ enum arc_direction direction = *directionp;
+ return direction == ASCENDING ? cmp : -cmp;
}
static int
dnl
AT_BANNER([AUTORECODE procedure])
+AT_SETUP([AUTORECODE multiple missing values])
+AT_DATA([autorecode.sps],
+ [DATA LIST LIST NOTABLE /u v w x y z (F2.0).
+BEGIN DATA.
+11 11 11 11 11 11
+12 12 12 12 12 12
+13 13 13 13 13 13
+14 14 14 14 14 14
+15 15 15 15 15 15
+16 16 16 16 16 16
+END DATA.
+
+MISSING VALUES u (11)
+ v (11, 12)
+ w (11, 12, 13)
+ x (11 THRU 14)
+ y (11 THRU 15)
+ z (11 THRU 16).
+
+AUTORECODE u v w x y z INTO a b c d e f.
+LIST.
+DISPLAY VARIABLES/VARIABLES=a TO f.
+])
+AT_CHECK([pspp -O format=csv autorecode.sps], [0], [dnl
+Table: Data List
+u,v,w,x,y,z,a,b,c,d,e,f
+11,11,11,11,11,11,6.00,5.00,4.00,3.00,2.00,1.00
+12,12,12,12,12,12,1.00,6.00,5.00,4.00,3.00,2.00
+13,13,13,13,13,13,2.00,1.00,6.00,5.00,4.00,3.00
+14,14,14,14,14,14,3.00,2.00,1.00,6.00,5.00,4.00
+15,15,15,15,15,15,4.00,3.00,2.00,1.00,6.00,5.00
+16,16,16,16,16,16,5.00,4.00,3.00,2.00,1.00,6.00
+
+Table: Variables
+Name,Position,Print Format,Write Format,Missing Values
+a,7,F8.2,F8.2,6
+b,8,F8.2,F8.2,5; 6
+c,9,F8.2,F8.2,4; 5; 6
+d,10,F8.2,F8.2,3 THRU 6
+e,11,F8.2,F8.2,2 THRU 6
+f,12,F8.2,F8.2,1 THRU 6
+])
+AT_CLEANUP
+
AT_SETUP([AUTORECODE numbers and short strings])
AT_DATA([autorecode.sps],
[data list /X 1-5(a) Y 7.
asdfk 1
end data.
+missing values x('asdfk') y(9).
+
autorecode x y into A B/descend.
list.
Table: Data List
X,Y,A,B
-lasdj,1,1.00,3.00
-asdfk,0,3.00,4.00
-asdfj,2,4.00,2.00
-asdfj,1,4.00,3.00
-asdfk,2,3.00,2.00
-asdfj,9,4.00,1.00
-lajks,9,2.00,1.00
-asdfk,0,3.00,4.00
-asdfk,1,3.00,3.00
+lasdj,1,1.00,2.00
+asdfk,0,4.00,3.00
+asdfj,2,3.00,1.00
+asdfj,1,3.00,2.00
+asdfk,2,4.00,1.00
+asdfj,9,3.00,4.00
+lajks,9,2.00,4.00
+asdfk,0,4.00,3.00
+asdfk,1,4.00,2.00
Table: Data List
X,Y,A,B,Z,W
-lasdj,1,1.00,3.00,.00,1.00
-asdfk,0,3.00,4.00,.00,1.00
-asdfj,2,4.00,2.00,1.00,2.00
-asdfj,1,4.00,3.00,.00,1.00
-asdfk,2,3.00,2.00,1.00,2.00
-asdfj,9,4.00,1.00,4.00,3.00
-lajks,9,2.00,1.00,4.00,3.00
-asdfk,0,3.00,4.00,.00,1.00
-asdfk,1,3.00,3.00,.00,1.00
+lasdj,1,1.00,2.00,.00,1.00
+asdfk,0,4.00,3.00,.00,1.00
+asdfj,2,3.00,1.00,1.00,2.00
+asdfj,1,3.00,2.00,.00,1.00
+asdfk,2,4.00,1.00,1.00,2.00
+asdfj,9,3.00,4.00,. ,. @&t@
+lajks,9,2.00,4.00,. ,. @&t@
+asdfk,0,4.00,3.00,.00,1.00
+asdfk,1,4.00,2.00,.00,1.00
])
AT_CLEANUP
16 18
end data.
+missing values y (12).
+
autorecode
x y into a b
/group.
list.
+display variables /variables=a b.
])
AT_CHECK([pspp -O format=csv ar-group.sps], [0],
[Table: Data List
x,y,a,b
11.00,10.00,2.00,1.00
-12.00,12.00,3.00,3.00
-13.00,15.00,4.00,6.00
-14.00,11.00,5.00,2.00
-15.00,12.00,6.00,3.00
-16.00,18.00,7.00,8.00
+12.00,12.00,8.00,8.00
+13.00,15.00,3.00,5.00
+14.00,11.00,4.00,2.00
+15.00,12.00,5.00,8.00
+16.00,18.00,6.00,7.00
+
+Table: Variables
+Name,Position,Print Format,Write Format,Missing Values
+a,3,F8.2,F8.2,8
+b,4,F8.2,F8.2,8
])
AT_CLEANUP