1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2009, 2010, 2012, 2013, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
22 #include "data/case.h"
23 #include "data/casereader.h"
24 #include "data/dataset.h"
25 #include "data/dictionary.h"
26 #include "data/transformations.h"
27 #include "data/variable.h"
28 #include "language/command.h"
29 #include "language/lexer/lexer.h"
30 #include "language/lexer/variable-parser.h"
31 #include "libpspp/array.h"
32 #include "libpspp/compiler.h"
33 #include "libpspp/hash-functions.h"
34 #include "libpspp/hmap.h"
35 #include "libpspp/i18n.h"
36 #include "libpspp/message.h"
37 #include "libpspp/pool.h"
38 #include "libpspp/str.h"
40 #include "gl/xalloc.h"
41 #include "gl/c-xvasprintf.h"
42 #include "gl/mbiter.h"
46 #define _(msgid) gettext (msgid)
48 /* FIXME: Implement PRINT subcommand. */
50 /* Explains how to recode one value: one distinct source value FROM, of
   width WIDTH, and the number TO that replaces it. */
53 struct hmap_node hmap_node; /* Element in the "struct rec_items" hash table. */
54 union value from; /* Original value. */
55 int width; /* Width of the original value; 0 is treated as numeric by the comparison code. */
57 double to; /* Recoded value: rank of FROM in sorted order, assigned by cmd_autorecode. */
60 /* Explains how to recode an AUTORECODE variable. */
63 int width; /* Variable width. */
64 int src_idx; /* Case index of source variable. */
65 struct variable *dst; /* Target variable. */
66 struct rec_items *items; /* Distinct source values; one table shared by all specs with GROUP, otherwise one per spec. */
69 /* Descending or ascending sort order. */
78 struct hmap ht; /* Hash table of "struct arc_item"s. */
83 /* AUTORECODE data: built by cmd_autorecode and then handed to the
   transformation callbacks as their auxiliary pointer. */
86 struct arc_spec *specs; /* Array of n_specs specs, one per source/target variable pair. */
92 static trns_proc_func autorecode_trns_proc;
93 static trns_free_func autorecode_trns_free;
95 static int compare_arc_items (const void *, const void *, const void *aux);
96 static void arc_free (struct autorecode_pgm *);
97 static struct arc_item *find_arc_item (
98 const struct rec_items *, const union value *, int width,
101 /* Returns WIDTH with any trailing spaces in VALUE trimmed off (except that a
102 minimum width of 1 is always returned because otherwise the width would
103 indicate a numeric type).

   VALUE is only read, never modified.  The loop condition tests WIDTH > 1
   before it touches VALUE->s, so for a WIDTH of 0 or 1 the string bytes are
   not inspected at all. */
105 value_trim_spaces (const union value *value, int width)
/* Scan backward over trailing spaces, stopping once the width reaches 1. */
107 while (width > 1 && value->s[width - 1] == ' ')
112 /* Performs the AUTORECODE procedure.

   Parses the source and target variable lists and the optional DESCENDING,
   PRINT, GROUP, and BLANK subcommands, reads the active dataset once to
   collect the distinct values of each source variable, numbers those values
   in sorted order, labels each new number with the source value it stands
   for, and finally installs a transformation that fills in the target
   variables.

   Returns CMD_SUCCESS if reading the data and committing the procedure both
   succeeded, otherwise CMD_CASCADING_FAILURE. */
114 cmd_autorecode (struct lexer *lexer, struct dataset *ds)
116 struct dictionary *dict = dataset_dict (ds);
118 const struct variable **src_vars = NULL;
121 char **dst_names = NULL;
124 enum arc_direction direction = ASCENDING;
126 /* Create procedure. */
127 struct autorecode_pgm *arc = xzalloc (sizeof *arc);
/* Blank strings count as valid values unless BLANK=MISSING is given below. */
128 arc->blank_valid = true;
130 /* Parse variable lists. */
131 lex_match_id (lexer, "VARIABLES");
132 lex_match (lexer, T_EQUALS);
133 if (!parse_variables_const (lexer, dict, &src_vars, &n_srcs,
134 PV_NO_DUPLICATE | PV_NO_SCRATCH))
136 lex_match (lexer, T_SLASH);
137 if (!lex_force_match_id (lexer, "INTO"))
139 lex_match (lexer, T_EQUALS);
140 if (!parse_DATA_LIST_vars (lexer, dict, &dst_names, &n_dsts,
/* Every source variable needs exactly one target name. */
143 if (n_dsts != n_srcs)
145 msg (SE, _("Source variable count (%zu) does not match "
146 "target variable count (%zu)."),
/* A target name must not already be present in the dictionary. */
151 for (size_t i = 0; i < n_dsts; i++)
153 const char *name = dst_names[i];
155 if (dict_lookup_var (dict, name) != NULL)
157 msg (SE, _("Target variable %s duplicates existing variable %s."),
/* Parse the optional subcommands, each introduced by a slash. */
165 while (lex_match (lexer, T_SLASH))
167 if (lex_match_id (lexer, "DESCENDING"))
168 direction = DESCENDING;
169 else if (lex_match_id (lexer, "PRINT"))
171 /* Not yet implemented. */
173 else if (lex_match_id (lexer, "GROUP"))
175 else if (lex_match_id (lexer, "BLANK"))
177 lex_match (lexer, T_EQUALS);
178 if (lex_match_id (lexer, "VALID"))
180 arc->blank_valid = true;
182 else if (lex_match_id (lexer, "MISSING"))
184 arc->blank_valid = false;
188 lex_error_expecting (lexer, "VALID", "MISSING");
194 lex_error_expecting (lexer, "DESCENDING", "PRINT", "GROUP", "BLANK");
199 if (lex_token (lexer) != T_ENDCMD)
201 lex_error (lexer, _("expecting end of command"));
205 /* If GROUP is specified, verify that the variables are all string or all
209 enum val_type type = var_get_type (src_vars[0]);
210 for (size_t i = 1; i < n_dsts; i++)
212 if (var_get_type (src_vars[i]) != type)
214 size_t string_idx = type == VAL_STRING ? 0 : i;
215 size_t numeric_idx = type == VAL_STRING ? i : 0;
216 lex_error (lexer, _("With GROUP, variables may not mix string "
217 "variables (such as %s) and numeric "
218 "variables (such as %s)."),
219 var_get_name (src_vars[string_idx]),
220 var_get_name (src_vars[numeric_idx]));
226 /* Allocate all the specs and the rec_items that they point to.
228 If GROUP is specified, there is only a single global rec_items, with the
229 maximum width 'width', and all of the specs point to it; otherwise each
230 spec has its own rec_items. */
231 arc->specs = xmalloc (n_dsts * sizeof *arc->specs);
232 arc->n_specs = n_dsts;
233 for (size_t i = 0; i < n_dsts; i++)
235 struct arc_spec *spec = &arc->specs[i];
237 spec->width = var_get_width (src_vars[i]);
238 spec->src_idx = var_get_case_index (src_vars[i]);
241 spec->items = arc->specs[0].items;
244 spec->items = xzalloc (sizeof (*spec->items));
245 hmap_init (&spec->items->ht);
249 /* Execute procedure. */
250 struct casereader *input = proc_open (ds);
/* One pass over the data: collect the distinct values of every source
   variable into the rec_items of its spec. */
252 for (; (c = casereader_read (input)) != NULL; case_unref (c))
253 for (size_t i = 0; i < arc->n_specs; i++)
255 struct arc_spec *spec = &arc->specs[i];
256 const union value *value = case_data_idx (c, spec->src_idx);
257 int width = value_trim_spaces (value, spec->width);
/* A string that trims down to a single space is blank.
   NOTE(review): the statement guarded by this test is not visible in this
   excerpt; presumably the blank is skipped under BLANK=MISSING so that it
   never gets an item -- confirm. */
258 if (width == 1 && value->s[0] == ' ' && !arc->blank_valid)
/* Look up the trimmed value before allocating a new item for it. */
261 size_t hash = value_hash (value, width, 0);
262 if (find_arc_item (spec->items, value, width, hash))
265 struct arc_item *item = xmalloc (sizeof *item);
/* Keep a private copy of the value at its trimmed width. */
267 value_clone (&item->from, value, width);
268 hmap_insert (&spec->items->ht, &item->hmap_node, hash);
/* proc_commit is the left operand of && so that it is called even when
   destroying the reader already reported failure. */
270 bool ok = casereader_destroy (input);
271 ok = proc_commit (ds) && ok;
273 /* Re-fetch dictionary because it might have changed (if TEMPORARY was in
275 dict = dataset_dict (ds);
277 /* Create transformation. */
278 for (size_t i = 0; i < arc->n_specs; i++)
280 struct arc_spec *spec = &arc->specs[i];
281 struct arc_item **items;
282 struct arc_item *item;
286 /* Create destination variable. */
287 spec->dst = dict_create_var_assert (dict, dst_names[i], 0);
289 /* Create array of pointers to items. */
290 n_items = hmap_count (&spec->items->ht);
291 items = xmalloc (n_items * sizeof *items);
293 HMAP_FOR_EACH (item, struct arc_item, hmap_node, &spec->items->ht)
296 assert (j == n_items);
298 /* Sort array by value. */
299 sort (items, n_items, sizeof *items, compare_arc_items, NULL);
301 /* Assign recoded values in sorted order: 1 up to n_items when ascending,
   n_items down to 1 when descending. */
302 for (j = 0; j < n_items; j++)
303 items[j]->to = direction == ASCENDING ? j + 1 : n_items - j;
305 /* Add value labels to the destination variable which indicate
306 the source value from which the new value comes. */
307 for (j = 0; j < n_items; j++)
309 const union value *from = &items[j]->from;
310 const int src_width = items[j]->width;
/* String source value.  NOTE(review): presumably recode_string converts the
   bytes from the dictionary encoding to UTF-8 -- confirm argument order. */
314 const char *str = CHAR_CAST_BUG (const char *, from->s);
316 recoded_value = recode_string (UTF8, dict_get_encoding (dict),
/* Numeric source value: format it with DBL_DIG + 1 significant digits. */
320 recoded_value = c_xasprintf ("%.*g", DBL_DIG + 1, from->f);
322 /* Remove trailing whitespace. */
323 size_t len = strlen (recoded_value);
324 while (len > 0 && recoded_value[len - 1] == ' ')
325 recoded_value[--len] = '\0';
327 /* Add value label, if it would be nonempty. */
330 union value to_val = { .f = items[j]->to };
331 var_add_value_label (spec->dst, &to_val, recoded_value);
333 free (recoded_value);
/* From here on ARC is passed to the transformation callbacks. */
339 add_transformation (ds, autorecode_trns_proc, autorecode_trns_free, arc);
341 for (size_t i = 0; i < n_dsts; i++)
346 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
349 for (size_t i = 0; i < n_dsts; i++)
354 return CMD_CASCADING_FAILURE;
/* Releases the items and hash tables that ARC's specs point to.

   The first loop walks every spec and removes each item from its table,
   destroying the stored value.  The second loop destroys the hash tables
   themselves, visiting only the first n_rec_items specs. */
358 arc_free (struct autorecode_pgm *arc)
362 for (size_t i = 0; i < arc->n_specs; i++)
364 struct arc_spec *spec = &arc->specs[i];
365 struct arc_item *item, *next;
/* The _SAFE iterator is used because items are deleted during the walk. */
367 HMAP_FOR_EACH_SAFE (item, next, struct arc_item, hmap_node,
370 value_destroy (&item->from, item->width);
371 hmap_delete (&spec->items->ht, &item->hmap_node);
/* Specs may all share one rec_items (the GROUP case), which must then be
   destroyed only once.  Sharing is detected by comparing the first two
   specs.  NOTE(review): the rest of this expression is not visible in this
   excerpt; presumably it yields 1 when shared and n_specs otherwise. */
377 (arc->n_specs == 1 || arc->specs[0].items == arc->specs[1].items
380 for (size_t i = 0; i < n_rec_items; i++)
382 struct arc_spec *spec = &arc->specs[i];
383 hmap_destroy (&spec->items->ht);
/* Searches ITEMS for the arc_item that represents VALUE.

   Only entries stored under HASH are examined; callers in this file compute
   HASH with value_hash() from the same VALUE and WIDTH.  An entry matches
   when its width equals WIDTH and its value equals VALUE at that width.

   NOTE(review): the return statements are not visible in this excerpt;
   callers test the result for null to mean that no item was found. */
392 static struct arc_item *
393 find_arc_item (const struct rec_items *items,
394 const union value *value, int width,
397 struct arc_item *item;
399 HMAP_FOR_EACH_WITH_HASH (item, struct arc_item, hmap_node, hash, &items->ht)
/* Compare widths first: value_equal is only asked about equal-width values. */
400 if (item->width == width && value_equal (value, &item->from, width))
/* Comparison callback that cmd_autorecode passes to sort().  A_ and B_ each
   point to an element of an array of pointers to arc_items.  AUX is unused.

   Items of equal width are ordered by value_compare_3way().  When exactly
   one of the widths is 0 the two items differ in type; NOTE(review): the
   values returned in those two cases are not visible in this excerpt.
   Otherwise both are strings of different widths and buf_compare_rpad()
   decides (presumably as if the shorter one were padded with spaces --
   confirm). */
406 compare_arc_items (const void *a_, const void *b_, const void *aux UNUSED)
408 const struct arc_item *const *a = a_;
409 const struct arc_item *const *b = b_;
410 int width_a = (*a)->width;
411 int width_b = (*b)->width;
/* Same width: an ordinary value comparison is sufficient. */
413 if ( width_a == width_b)
414 return value_compare_3way (&(*a)->from, &(*b)->from, width_a);
/* Mixed numeric (width 0) and string operands. */
416 if ( width_a == 0 && width_b != 0)
419 if ( width_b == 0 && width_a != 0)
/* Two strings whose widths differ. */
422 return buf_compare_rpad (CHAR_CAST_BUG (const char *, (*a)->from.s), width_a,
423 CHAR_CAST_BUG (const char *, (*b)->from.s), width_b);
/* Transformation callback installed by cmd_autorecode.  ARC_ is the
   struct autorecode_pgm built there; CASE_IDX is unused.

   For each spec, looks up the source value of *C, trimmed the same way as
   when the items were collected, and stores the recoded number of the
   matching item in the destination variable, or SYSMIS when no item
   matches.  Returns TRNS_CONTINUE. */
427 autorecode_trns_proc (void *arc_, struct ccase **c,
428 casenumber case_idx UNUSED)
430 struct autorecode_pgm *arc = arc_;
/* NOTE(review): presumably case_unshare yields a case that may be written
   to without affecting other holders -- confirm against data/case.h. */
432 *c = case_unshare (*c);
433 for (size_t i = 0; i < arc->n_specs; i++)
435 const struct arc_spec *spec = &arc->specs[i];
436 const union value *value = case_data_idx (*c, spec->src_idx);
/* Width and hash must be derived exactly as in the data pass so that the
   lookup can find the stored item. */
437 int width = value_trim_spaces (value, spec->width);
438 size_t hash = value_hash (value, width, 0);
439 const struct arc_item *item = find_arc_item (spec->items, value, width,
/* A value that was never recorded gets the system-missing value. */
441 case_data_rw (*c, spec->dst)->f = item ? item->to : SYSMIS;
444 return TRNS_CONTINUE;
448 autorecode_trns_free (void *arc_)
450 struct autorecode_pgm *arc = arc_;