1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "data/case.h"
24 #include "data/data-in.h"
25 #include "data/dataset.h"
26 #include "data/dictionary.h"
27 #include "data/format.h"
28 #include "data/transformations.h"
29 #include "data/variable.h"
30 #include "language/command.h"
31 #include "language/lexer/lexer.h"
32 #include "language/lexer/value-parser.h"
33 #include "language/lexer/variable-parser.h"
34 #include "libpspp/assertion.h"
35 #include "libpspp/cast.h"
36 #include "libpspp/compiler.h"
37 #include "libpspp/i18n.h"
38 #include "libpspp/message.h"
39 #include "libpspp/pool.h"
40 #include "libpspp/str.h"
42 #include "gl/xalloc.h"
/* Shorthand that passes a message ID to gettext. */
45 #define _(msgid) gettext (msgid)
49 /* Type of source value for RECODE. */
52 MAP_SINGLE, /* Specific value. */
53 MAP_RANGE, /* Range of values. */
54 MAP_SYSMIS, /* System missing value. */
55 MAP_MISSING, /* Any missing value. */
56 MAP_ELSE, /* Any value. */
57 MAP_CONVERT /* "123" => 123. */
60 /* Describes input values to be mapped. */
63 enum map_in_type type; /* One of MAP_*. */
/* The range test in find_src_numeric treats x and y as inclusive lower and
   upper bounds; the single-value tests compare against x alone. */
64 union value x, y; /* Source values. */
67 /* Describes the value used as output from a mapping. */
70 bool copy_input; /* If true, copy input to output. */
71 union value value; /* If copy_input false, recoded value. */
72 int width; /* If copy_input false, output value width. */
/* parse_map_out sets ofs to lex_ofs () - 1 once the value has been parsed;
   it is later handed to lex_ofs_msg so diagnostics can point at this
   output value. */
73 int ofs; /* Lexical location. */
76 /* Describes how to recode a single value or range of values into a
80 struct map_in in; /* Input values. */
81 struct map_out out; /* Output value. */
84 /* RECODE transformation. */
89 /* Variable types, for convenience. */
90 enum val_type src_type; /* src_vars[*] type. */
91 enum val_type dst_type; /* dst_vars[*] type. */
94 const struct variable **src_vars; /* Source variables. */
95 const struct variable **dst_vars; /* Destination variables. */
96 const struct dictionary *dst_dict; /* Dictionary of dst_vars */
97 char **dst_names; /* Name of dest variables, if they're new. */
98 size_t n_vars; /* Number of variables. */
101 struct mapping *mappings; /* Value mappings. */
102 size_t n_maps; /* Number of mappings. */
103 int max_src_width; /* Maximum width of src_vars[*]. */
104 int max_dst_width; /* Width of the widest dst_vars[*]; string outputs are resized to it. */
/* Forward declarations: these are defined further down in this file but
   are used before their definitions. */
107 static bool parse_src_vars (struct lexer *, struct recode_trns *, const struct dictionary *dict);
108 static bool parse_mappings (struct lexer *, struct recode_trns *,
109 const char *dict_encoding);
110 static bool parse_dst_vars (struct lexer *, struct recode_trns *,
111 const struct dictionary *,
112 int src_start, int src_end,
113 int mappings_start, int mappings_end);
115 static void add_mapping (struct recode_trns *,
116 size_t *map_allocated, const struct map_in *);
118 static bool parse_map_in (struct lexer *lexer, struct map_in *, struct pool *,
119 enum val_type src_type, size_t max_src_width,
120 const char *dict_encoding);
121 static void set_map_in_str (struct map_in *, struct pool *,
122 struct substring, size_t width,
123 const char *dict_encoding);
125 static bool parse_map_out (struct lexer *lexer, struct pool *, struct map_out *);
126 static void set_map_out_str (struct map_out *, struct pool *,
129 static bool enlarge_dst_widths (struct lexer *, struct recode_trns *,
130 int dst_start, int dst_end);
131 static void create_dst_vars (struct recode_trns *, struct dictionary *);
133 static bool recode_trns_free (void *trns_);
135 static const struct trns_class recode_trns_class;
/* Parses one recoding specification from LEXER into TRNS: the source
   variables, then the input-to-output mappings, then any destination
   variables.  The token offsets that bracket each part are saved so that
   diagnostics issued by the later steps can point at the relevant syntax.
   The last visible step registers TRNS on DS via add_transformation.
   cmd_recode treats a false result as failure and frees TRNS. */
140 parse_one_recoding (struct lexer *lexer, struct dataset *ds,
141 struct recode_trns *trns)
143 struct dictionary *dict = dataset_dict (ds);
145 /* Parse source variable names,
146 then input to output mappings,
147 then destination variable names. */
148 int src_start = lex_ofs (lexer);
149 if (!parse_src_vars (lexer, trns, dict))
151 int src_end = lex_ofs (lexer) - 1;
153 int mappings_start = lex_ofs (lexer);
154 if (!parse_mappings (lexer, trns, dict_get_encoding (dict)))
156 int mappings_end = lex_ofs (lexer) - 1;
158 int dst_start = lex_ofs (lexer);
159 if (!parse_dst_vars (lexer, trns, dict,
160 src_start, src_end, mappings_start, mappings_end))
162 int dst_end = lex_ofs (lexer) - 1;
/* An end offset before the start offset means parse_dst_vars consumed no
   tokens at all. */
163 if (dst_end < dst_start)
165 /* There was no target variable syntax, so the target variables are the
166 same as the source variables. */
167 dst_start = src_start;
171 /* Ensure that all the output strings are at least as wide
172 as the widest destination variable. */
173 if (trns->dst_type == VAL_STRING
174 && !enlarge_dst_widths (lexer, trns, dst_start, dst_end))
177 /* Create destination variables, if needed.
178 This must be the final step; otherwise we'd have to
179 delete destination variables on failure. */
180 trns->dst_dict = dict;
/* parse_dst_vars makes dst_vars point at the src_vars array itself when
   there is no INTO clause; in that case nothing needs creating. */
181 if (trns->src_vars != trns->dst_vars)
182 create_dst_vars (trns, dict);
185 add_transformation (ds, &recode_trns_class, trns);
189 /* Parses the RECODE transformation. */
191 cmd_recode (struct lexer *lexer, struct dataset *ds)
/* TRNS is allocated from its own pool, so everything hung off the pool,
   including TRNS itself, goes away when recode_trns_free destroys it. */
195 struct pool *pool = pool_create ();
196 struct recode_trns *trns = pool_alloc (pool, sizeof *trns);
197 *trns = (struct recode_trns) { .pool = pool };
199 if (!parse_one_recoding (lexer, ds, trns))
201 recode_trns_free (trns);
/* Another recoding follows if the next token is a slash. */
205 while (lex_match (lexer, T_SLASH));
210 /* Parses a set of variables to recode into TRNS->src_vars and
211 TRNS->n_vars. Sets TRNS->src_type. Returns true if
212 successful, false on parse error. */
214 parse_src_vars (struct lexer *lexer,
215 struct recode_trns *trns, const struct dictionary *dict)
217 if (!parse_variables_const (lexer, dict, &trns->src_vars, &trns->n_vars,
/* Register src_vars with the pool, naming free as the function to call on
   it. */
220 pool_register (trns->pool, free, trns->src_vars);
/* Only the first variable is consulted for the type.
   NOTE(review): presumably the parse options (not visible here) force all
   source variables to share one type -- confirm. */
221 trns->src_type = var_get_type (trns->src_vars[0]);
225 /* Parses a set of mappings, which take the form (input=output),
226 into TRNS->mappings and TRNS->n_maps. Sets TRNS->dst_type.
227 Returns true if successful, false on parse error. */
229 parse_mappings (struct lexer *lexer, struct recode_trns *trns,
230 const char *dict_encoding)
232 /* Find length of longest source variable. */
233 trns->max_src_width = var_get_width (trns->src_vars[0]);
/* NOTE(review): var_width is size_t but max_src_width is int, so the
   comparison in the loop is carried out unsigned; that is only safe while
   widths are never negative. */
234 for (size_t i = 1; i < trns->n_vars; i++)
236 size_t var_width = var_get_width (trns->src_vars[i]);
237 if (var_width > trns->max_src_width)
238 trns->max_src_width = var_width;
241 /* Parse the mappings in parentheses. */
242 size_t map_allocated = 0;
243 bool have_dst_type = false;
244 if (!lex_force_match (lexer, T_LPAREN))
248 enum val_type dst_type;
250 if (!lex_match_id (lexer, "CONVERT"))
/* Every input added from this index on belongs to the current
   parenthesized group and receives the same output. */
252 size_t first_map_idx = trns->n_maps;
254 /* Parse source specifications. */
259 if (!parse_map_in (lexer, &in, trns->pool,
260 trns->src_type, trns->max_src_width,
263 add_mapping (trns, &map_allocated, &in);
/* A comma between inputs is optional; an equals sign ends the input
   list. */
264 lex_match (lexer, T_COMMA);
266 while (!lex_match (lexer, T_EQUALS));
269 if (!parse_map_out (lexer, trns->pool, &out))
272 dst_type = (out.copy_input
274 : val_type_from_width (out.width));
/* Give the one parsed output to each input of this group. */
275 for (size_t i = first_map_idx; i < trns->n_maps; i++)
276 trns->mappings[i].out = out;
280 /* Parse CONVERT as a special case. */
281 struct map_in in = { .type = MAP_CONVERT };
282 add_mapping (trns, &map_allocated, &in);
/* CONVERT was the last token consumed, so this is its offset. */
284 int ofs = lex_ofs (lexer) - 1;
285 trns->mappings[trns->n_maps - 1].out = (struct map_out) {
289 dst_type = VAL_NUMERIC;
290 if (trns->src_type != VAL_STRING)
292 lex_ofs_error (lexer, ofs, ofs,
293 _("CONVERT requires string input values."));
/* The first group fixes the output type; each later group must agree with
   it. */
297 if (have_dst_type && dst_type != trns->dst_type)
299 msg (SE, _("Output values must be all numeric or all string."));
/* The two most recent mappings are taken as the disagreeing pair.
   NOTE(review): when the newest group holds two or more inputs, both of
   these entries carry that same group's output, so the two notes below
   would point at the same token -- confirm. */
301 assert (trns->n_maps > 1);
302 const struct map_out *numeric = &trns->mappings[trns->n_maps - 2].out;
303 const struct map_out *string = &trns->mappings[trns->n_maps - 1].out;
305 if (trns->dst_type == VAL_STRING)
307 const struct map_out *tmp = numeric;
312 lex_ofs_msg (lexer, SN, numeric->ofs, numeric->ofs,
313 _("This output value is numeric."));
314 lex_ofs_msg (lexer, SN, string->ofs, string->ofs,
315 _("This output value is string."));
318 trns->dst_type = dst_type;
319 have_dst_type = true;
321 if (!lex_force_match (lexer, T_RPAREN))
/* Each further left parenthesis starts another mapping group. */
324 while (lex_match (lexer, T_LPAREN));
329 /* Parses a mapping input value into IN, allocating memory from
330 POOL. The source value type must be provided as SRC_TYPE and,
331 if string, the maximum width of a string source variable must
332 be provided in MAX_SRC_WIDTH. Returns true if successful,
333 false on parse error. */
335 parse_map_in (struct lexer *lexer, struct map_in *in, struct pool *pool,
336 enum val_type src_type, size_t max_src_width,
337 const char *dict_encoding)
/* ELSE is accepted whatever the source type. */
340 if (lex_match_id (lexer, "ELSE"))
341 *in = (struct map_in) { .type = MAP_ELSE };
342 else if (src_type == VAL_NUMERIC)
344 if (lex_match_id (lexer, "MISSING"))
345 *in = (struct map_in) { .type = MAP_MISSING };
346 else if (lex_match_id (lexer, "SYSMIS"))
347 *in = (struct map_in) { .type = MAP_SYSMIS };
351 if (!parse_num_range (lexer, &x, &y, NULL))
/* A range whose two ends are equal is stored as a single value. */
353 *in = (struct map_in) {
354 .type = x == y ? MAP_SINGLE : MAP_RANGE,
/* For string sources the only keyword tried here is MISSING; anything else
   has to be a string token. */
362 if (lex_match_id (lexer, "MISSING"))
363 *in = (struct map_in) { .type = MAP_MISSING };
364 else if (!lex_force_string (lexer))
/* The string is stored at MAX_SRC_WIDTH; a THRU keyword following it is
   diagnosed below, since ranges are not offered for strings. */
368 set_map_in_str (in, pool, lex_tokss (lexer), max_src_width,
371 if (lex_match_id (lexer, "THRU"))
373 lex_next_error (lexer, -1, -1,
374 _("%s is not allowed with string variables."),
384 /* Adds IN to the list of mappings in TRNS.
385 MAP_ALLOCATED is the current number of allocated mappings,
386 which is updated as needed. */
388 add_mapping (struct recode_trns *trns,
389 size_t *map_allocated, const struct map_in *in)
/* Grow the array only once it is full.
   NOTE(review): pool_2nrealloc presumably enlarges the allocation
   geometrically and updates the allocated count, in the manner of the
   gnulib x2nrealloc function -- confirm. */
392 if (trns->n_maps >= *map_allocated)
393 trns->mappings = pool_2nrealloc (trns->pool, trns->mappings,
395 sizeof *trns->mappings);
/* Take the next free slot and count it. */
396 m = &trns->mappings[trns->n_maps++];
400 /* Sets IN as a string mapping, with STRING as the string,
401 allocated from POOL. The string is padded with spaces on the
402 right to WIDTH characters long. */
404 set_map_in_str (struct map_in *in, struct pool *pool,
405 struct substring string, size_t width,
406 const char *dict_encoding)
408 *in = (struct map_in) { .type = MAP_SINGLE };
/* NOTE(review): presumably converts the UTF-8 token text into the
   dictionary encoding -- confirm the argument order of recode_string.
   No release of S is visible in this view; confirm that it is freed after
   the copy below. */
410 char *s = recode_string (dict_encoding, "UTF-8",
411 ss_data (string), ss_length (string));
/* The converted text is copied into a value of WIDTH bytes taken from
   POOL, padded on the right with spaces. */
412 value_init_pool (pool, &in->x, width);
413 value_copy_buf_rpad (&in->x, width,
414 CHAR_CAST (uint8_t *, s), strlen (s), ' ');
418 /* Parses a mapping output value into OUT, allocating memory from
419 POOL. Returns true if successful, false on parse error. */
421 parse_map_out (struct lexer *lexer, struct pool *pool, struct map_out *out)
/* In the numeric and SYSMIS cases the members not named in the
   initializer, width included, are zero; parse_mappings derives the
   output type from that width via val_type_from_width. */
423 if (lex_is_number (lexer))
425 *out = (struct map_out) { .value = { .f = lex_number (lexer) } };
428 else if (lex_match_id (lexer, "SYSMIS"))
429 *out = (struct map_out) { .value = { .f = SYSMIS } };
430 else if (lex_is_string (lexer))
432 set_map_out_str (out, pool, lex_tokss (lexer));
435 else if (lex_match_id (lexer, "COPY"))
436 *out = (struct map_out) { .copy_input = true };
439 lex_error (lexer, _("Syntax error expecting output value."));
/* Remember the offset of the token just before the current position, so
   that diagnostics can point at this output value. */
442 out->ofs = lex_ofs (lexer) - 1;
446 /* Sets OUT as a string mapping output with the given VALUE. */
448 set_map_out_str (struct map_out *out, struct pool *pool,
449 const struct substring value)
/* Further down, LENGTH becomes the width of the output, storage of that
   size is taken from POOL, and LENGTH bytes of STRING are copied into it
   with no null terminator added. */
451 const char *string = ss_data (value);
452 size_t length = ss_length (value);
456 /* A length of 0 will yield a numeric value, which is not
462 *out = (struct map_out) { .width = length };
463 value_init_pool (pool, &out->value, length);
464 memcpy (out->value.s, string, length);
// With INTO, the names that follow are the targets and there must be as
// many of them as there are source variables; a target that does not
// exist yet is rejected when the output type is string.  Without INTO,
// dst_vars is set to the src_vars array itself, which requires the source
// and output types to agree.  In both cases every target that already
// exists must have the output type.
467 /* Parses a set of target variables into TRNS->dst_vars and
470 parse_dst_vars (struct lexer *lexer, struct recode_trns *trns,
471 const struct dictionary *dict, int src_start, int src_end,
472 int mappings_start, int mappings_end)
474 int dst_start, dst_end;
475 if (lex_match_id (lexer, "INTO"))
477 dst_start = lex_ofs (lexer);
479 if (!parse_mixed_vars_pool (lexer, dict, trns->pool,
480 &trns->dst_names, &n_names,
483 dst_end = lex_ofs (lexer) - 1;
485 if (n_names != trns->n_vars)
487 msg (SE, _("Source and target variable counts must match."));
488 lex_ofs_msg (lexer, SN, src_start, src_end,
489 ngettext ("There is %zu source variable.",
490 "There are %zu source variables.",
493 lex_ofs_msg (lexer, SN, dst_start, dst_end,
494 ngettext ("There is %zu target variable.",
495 "There are %zu target variables.",
501 trns->dst_vars = pool_nalloc (trns->pool,
502 trns->n_vars, sizeof *trns->dst_vars);
503 for (size_t i = 0; i < trns->n_vars; i++)
505 const struct variable *v;
506 v = trns->dst_vars[i] = dict_lookup_var (dict, trns->dst_names[i]);
507 if (v == NULL && trns->dst_type == VAL_STRING)
509 msg (SE, _("All string variables specified on INTO must already "
510 "exist. (Use the STRING command to create a string "
512 lex_ofs_msg (lexer, SN, dst_start, dst_end,
513 _("There is no variable named %s."),
521 dst_start = src_start;
524 trns->dst_vars = trns->src_vars;
525 if (trns->src_type != trns->dst_type)
527 if (trns->src_type == VAL_NUMERIC)
528 lex_ofs_error (lexer, mappings_start, mappings_end,
529 _("INTO is required with numeric input values "
530 "and string output values."));
532 lex_ofs_error (lexer, mappings_start, mappings_end,
533 _("INTO is required with string input values "
534 "and numeric output values."));
539 for (size_t i = 0; i < trns->n_vars; i++)
541 const struct variable *v = trns->dst_vars[i];
542 if (v && var_get_type (v) != trns->dst_type)
544 if (trns->dst_type == VAL_STRING)
545 lex_ofs_error (lexer, dst_start, dst_end,
546 _("Type mismatch: cannot store string data in "
547 "numeric variable %s."), var_get_name (v));
549 lex_ofs_error (lexer, dst_start, dst_end,
550 _("Type mismatch: cannot store numeric data in "
551 "string variable %s."), var_get_name (v));
559 /* Ensures that all the output values in TRNS are as wide as the
560 widest destination variable. */
562 enlarge_dst_widths (struct lexer *lexer, struct recode_trns *trns,
563 int dst_start, int dst_end)
/* narrow_var is used only to name a variable in the diagnostic below. */
565 const struct variable *narrow_var = NULL;
566 int min_dst_width = INT_MAX;
567 trns->max_dst_width = 0;
/* One pass finds both the widest and the narrowest destination width. */
569 for (size_t i = 0; i < trns->n_vars; i++)
571 const struct variable *v = trns->dst_vars[i];
572 if (var_get_width (v) > trns->max_dst_width)
573 trns->max_dst_width = var_get_width (v);
575 if (var_get_width (v) < min_dst_width)
577 min_dst_width = var_get_width (v);
/* An output longer than the narrowest target would not fit in it, so that
   is an error; outputs flagged copy_input carry no value and are skipped. */
582 for (size_t i = 0; i < trns->n_maps; i++)
584 struct map_out *out = &trns->mappings[i].out;
585 if (!out->copy_input)
587 if (out->width > min_dst_width)
589 msg (SE, _("At least one target variable is too narrow for "
590 "the output values."));
591 lex_ofs_msg (lexer, SN, out->ofs, out->ofs,
592 _("This recoding output value has width %d."),
594 lex_ofs_msg (lexer, SN, dst_start, dst_end,
595 _("Target variable %s only has width %d."),
596 var_get_name (narrow_var),
597 var_get_width (narrow_var));
/* Widen the stored output to max_dst_width: recode_trns_proc copies as
   many bytes as the target variable is wide, which is at most this. */
601 value_resize_pool (trns->pool, &out->value,
602 out->width, trns->max_dst_width);
609 /* Creates destination variables that don't already exist. */
611 create_dst_vars (struct recode_trns *trns, struct dictionary *dict)
613 for (size_t i = 0; i < trns->n_vars; i++)
615 const struct variable **var = &trns->dst_vars[i];
616 const char *name = trns->dst_names[i];
/* Look the name up again and store the result back into dst_vars[i]. */
618 *var = dict_lookup_var (dict, name);
/* New variables are created with width 0: parse_dst_vars has already
   rejected string targets that do not exist, and the assertion below
   checks that the resulting type is the output type. */
620 *var = dict_create_var_assert (dict, name, 0);
621 assert (var_get_type (*var) == trns->dst_type);
625 /* Data transformation. */
627 /* Returns the output mapping in TRNS for an input of VALUE on
628 variable V, or a null pointer if there is no mapping. */
629 static const struct map_out *
630 find_src_numeric (struct recode_trns *trns, double value, const struct variable *v)
632 for (struct mapping *m = trns->mappings; m < trns->mappings + trns->n_maps;
635 const struct map_in *in = &m->in;
636 const struct map_out *out = &m->out;
/* Exact comparison against the single source value. */
642 match = value == in->x.f;
/* Missing as judged by var_is_num_missing for V. */
645 match = var_is_num_missing (v, value) != 0;
/* Inclusive at both ends. */
648 match = value >= in->x.f && value <= in->y.f;
/* The system-missing value only. */
651 match = value == SYSMIS;
667 /* Returns the output mapping in TRNS for an input of VALUE, the string
668 value of SRC_VAR, or a null pointer if there is no mapping. */
669 static const struct map_out *
670 find_src_string (struct recode_trns *trns, const uint8_t *value,
671 const struct variable *src_var)
673 const char *encoding = dict_get_encoding (trns->dst_dict);
/* VALUE is treated as exactly this many bytes throughout. */
674 int width = var_get_width (src_var);
675 for (struct mapping *m = trns->mappings; m < trns->mappings + trns->n_maps;
678 const struct map_in *in = &m->in;
679 struct map_out *out = &m->out;
/* Byte-for-byte comparison over WIDTH bytes. */
685 match = !memcmp (value, in->x.s, width);
/* Try to read the string as an F-format number; it counts as a match when
   data_in reports no error. */
695 error = data_in (ss_buffer (CHAR_CAST_BUG (char *, value), width),
696 C_ENCODING, FMT_F, settings_get_fmt_settings (),
698 match = error == NULL;
/* Missing as judged by var_is_str_missing for SRC_VAR. */
705 match = var_is_str_missing (src_var, value) != 0;
718 /* Performs RECODE transformation. */
719 static enum trns_result
720 recode_trns_proc (void *trns_, struct ccase **c, casenumber case_idx UNUSED)
722 struct recode_trns *trns = trns_;
/* The case is written to below, so obtain an unshared copy first. */
724 *c = case_unshare (*c);
/* Source and destination variables are paired up by index. */
725 for (size_t i = 0; i < trns->n_vars; i++)
727 const struct variable *src_var = trns->src_vars[i];
728 const struct variable *dst_var = trns->dst_vars[i];
729 const struct map_out *out;
/* Look up the mapping that applies to the current source value. */
731 if (trns->src_type == VAL_NUMERIC)
732 out = find_src_numeric (trns, case_num (*c, src_var), src_var);
734 out = find_src_string (trns, case_str (*c, src_var), src_var);
736 if (trns->dst_type == VAL_NUMERIC)
738 double *dst = case_num_rw (*c, dst_var);
/* An output flagged copy_input passes the source value through. */
740 *dst = !out->copy_input ? out->value.f : case_num (*c, src_var);
741 else if (trns->src_vars != trns->dst_vars)
746 char *dst = CHAR_CAST_BUG (char *, case_str_rw (*c, dst_var));
/* enlarge_dst_widths resized every stored output to the widest target, so
   the target width never exceeds what out->value.s holds. */
749 if (!out->copy_input)
750 memcpy (dst, out->value.s, var_get_width (dst_var));
/* Copying the input only does anything when the target is a different
   variable; the source value is padded with spaces to the target width. */
751 else if (trns->src_vars != trns->dst_vars)
753 union value *dst_data = case_data_rw (*c, dst_var);
754 const union value *src_data = case_data (*c, src_var);
755 value_copy_rpad (dst_data, var_get_width (dst_var),
756 src_data, var_get_width (src_var), ' ');
/* When the targets are separate variables, this fallback fills the target
   with spaces. */
759 else if (trns->src_vars != trns->dst_vars)
760 memset (dst, ' ', var_get_width (dst_var));
764 return TRNS_CONTINUE;
767 /* Frees a RECODE transformation. */
769 recode_trns_free (void *trns_)
771 struct recode_trns *trns = trns_;
/* TRNS itself was allocated from this pool in cmd_recode, so destroying
   the pool releases the whole transformation. */
772 pool_destroy (trns->pool);
/* Callbacks for the RECODE transformation: recode_trns_proc is the execute
   hook and recode_trns_free is the destroy hook. */
776 static const struct trns_class recode_trns_class = {
778 .execute = recode_trns_proc,
779 .destroy = recode_trns_free,