From: Ben Pfaff Date: Sat, 6 May 2023 01:01:06 +0000 (-0700) Subject: combine-files: Allow matching string variables to have different widths. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=638bf380e550;p=pspp combine-files: Allow matching string variables to have different widths. Requested by Frans Houweling and Alan Mead at https://lists.gnu.org/archive/html/pspp-users/2015-03/msg00008.html --- diff --git a/NEWS b/NEWS index 2ba75571b3..6c15134edf 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,9 @@ Changes after 1.6.2: - Break variables are now optional. + * ADD FILES, MATCH FILES, and UPDATE now allow string variables with + the same name to have different widths. + * CROSSTABS now calculates significance of Pearson and Spearman correlations in symmetric measures. diff --git a/doc/combining.texi b/doc/combining.texi index 751612a874..8bb9eafb33 100644 --- a/doc/combining.texi +++ b/doc/combining.texi @@ -116,8 +116,10 @@ The variables in the new active dataset are the union of all the input files' variables, matched based on their name. When a single input file contains a variable with a given name, the output file will contain exactly that variable. When more than one input file contains a -variable with a given name, those variables must all have the same -type (numeric or string) and, for string variables, the same width. +variable with a given name, those variables must be all string or all numeric. +If they are string variables, then the result will have the width of the longest +variable with that name, with narrower values padded on the right with spaces +to fill the width. Variables are matched after renaming with the @subcmd{RENAME} subcommand. Thus, @subcmd{RENAME} can be used to resolve conflicts. Only variables in the output file can conflict, so @subcmd{DROP} or diff --git a/src/data/subcase.h b/src/data/subcase.h index 027a639542..7abbef6d49 100644 --- a/src/data/subcase.h +++ b/src/data/subcase.h @@ -84,6 +84,7 @@ static inline size_t subcase_get_n_fields (const struct subcase *); static inline size_t subcase_get_case_index (const struct subcase *, size_t idx); +static inline int subcase_get_width (const struct subcase *, size_t idx); static inline enum subcase_direction subcase_get_direction ( const struct subcase *, size_t idx); @@ -119,6 +120,12 @@ subcase_get_case_index (const struct subcase *sc, size_t idx) return sc->fields[idx].case_index; } +static inline int +subcase_get_width (const struct subcase *sc, size_t idx) +{ + return sc->fields[idx].width; +} + static inline enum subcase_direction subcase_get_direction (const struct subcase *sc, size_t idx) { diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c index ad6914b364..5641328101 100644 --- a/src/language/commands/combine-files.c +++ b/src/language/commands/combine-files.c @@ -62,6 +62,21 @@ enum comb_file_type COMB_TABLE /* Specified on TABLE= subcommand. */ }; +/* These commands combine multiple input files into a single master file. The + input files may merge string variables with different widths, right-padding + with spaces to the length of the longest. This data structure allows for + that. */ +struct comb_resizer + { + struct caseproto *output_proto; + int *indexes; + size_t n_indexes; + }; + +static struct casereader *comb_resize (struct casereader *, + struct comb_resizer *); +static void comb_resizer_destroy (struct comb_resizer *); + /* One FILE or TABLE subcommand. */ struct comb_file { @@ -82,6 +97,7 @@ struct comb_file bool is_minimal; /* Does 'data' have minimum BY values across all input files? */ bool is_sorted; /* Is file presorted on the BY variables? */ + struct comb_resizer *resizer; /* If necessary. */ /* IN subcommand. */ char *in_name; @@ -98,7 +114,7 @@ struct comb_proc struct subcase by_vars; /* BY variables in the output. */ struct casewriter *output; /* Destination for output. */ - /* Names of variables whose types or widths different among the files. + /* Names of variables whose types differ among the files. It's OK if they're all dropped, but not otherwise. */ struct stringi_set different_types; @@ -277,6 +293,41 @@ combine_files (enum comb_command_type command, merge_dictionary (&proc, file); } + for (size_t i = 0; i < dict_get_n_vars (proc.dict); i++) + { + const struct variable *mv = dict_get_var (proc.dict, i); + const char *name = var_get_name (mv); + int mw = var_get_width (mv); + if (!mw || stringi_set_contains (&proc.different_types, name)) + continue; + + for (size_t j = 0; j < proc.n_files; j++) + { + struct comb_file *cf = &proc.files[j]; + struct variable *dv = dict_lookup_var (cf->dict, name); + if (!dv) + continue; + + int dw = var_get_width (dv); + assert (dw <= mw); + if (dw < mw) + { + struct comb_resizer *r = cf->resizer; + if (!r) + { + r = xmalloc (sizeof *r); + *r = (struct comb_resizer) { + .output_proto = caseproto_ref (dict_get_proto (proc.dict)), + .indexes = xnmalloc (dict_get_n_vars (cf->dict), + sizeof *r->indexes), + }; + cf->resizer = r; + } + r->indexes[r->n_indexes++] = var_get_dict_index (dv); + } + } + } + while (lex_token (lexer) != T_ENDCMD) { if (lex_match (lexer, T_BY)) @@ -316,8 +367,9 @@ combine_files (enum comb_command_type command, const char *name = var_get_name (by_vars[j]); struct variable *var = dict_lookup_var (file->dict, name); if (var != NULL) - subcase_add_var (&file->by_vars, var, - subcase_get_direction (&proc.by_vars, j)); + subcase_add (&file->by_vars, var_get_dict_index (var), + subcase_get_width (&proc.by_vars, j), + subcase_get_direction (&proc.by_vars, j)); else { const char *fn @@ -491,6 +543,11 @@ combine_files (enum comb_command_type command, else file->reader = casereader_clone (active_file); } + if (file->resizer) + { + file->reader = comb_resize (file->reader, file->resizer); + file->resizer = NULL; + } if (!file->is_sorted) file->reader = sort_execute (file->reader, &file->by_vars); taint_propagate (casereader_get_taint (file->reader), taint); @@ -580,8 +637,11 @@ merge_dictionary (struct comb_proc *proc, struct comb_file *f) if (!mv) mv = dict_clone_var_assert (m, dv); - else if (var_get_width (mv) == var_get_width (dv)) + else if (var_get_type (mv) == var_get_type (dv)) { + if (var_get_width (dv) > var_get_width (mv)) + var_set_width (mv, var_get_width (dv)); + if (var_has_value_labels (dv) && !var_has_value_labels (mv)) var_set_value_labels (mv, var_get_value_labels (dv)); if (var_has_missing_values (dv) && !var_has_missing_values (mv)) @@ -598,8 +658,7 @@ static void different_types_error (struct comb_proc *proc, struct lexer *lexer, const char *var_name) { - msg (SE, _("Variable %s has different type or width in different " - "files."), var_name); + msg (SE, _("Variable %s has different types in different files."), var_name); for (size_t i = 0; i < proc->n_files; i++) { const struct comb_file *ef = &proc->files[i]; @@ -614,8 +673,7 @@ different_types_error (struct comb_proc *proc, fn, var_name); else lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs, - _("In file %s, %s is a string with width %d."), - fn, var_name, var_get_width (ev)); + _("In file %s, %s is a string."), fn, var_name); } } @@ -707,6 +765,7 @@ close_all_comb_files (struct comb_proc *proc) dict_unref (file->dict); casereader_destroy (file->reader); case_unref (file->data); + comb_resizer_destroy (file->resizer); free (file->in_name); } free (proc->files); @@ -1000,3 +1059,56 @@ output_buffered_case (struct comb_proc *proc) proc->buffered_case = NULL; } } + +static void +comb_resizer_destroy (struct comb_resizer *r) +{ + if (!r) + return; + + caseproto_unref (r->output_proto); + free (r->indexes); + free (r); +} + +static struct ccase * +comb_resize_translate (struct ccase *c, void *r_) +{ + struct comb_resizer *r = r_; + + c = case_unshare (c); + + for (size_t i = 0; i < r->n_indexes; i++) + { + int idx = r->indexes[i]; + value_resize (&c->values[idx], + caseproto_get_width (c->proto, idx), + caseproto_get_width (r->output_proto, idx)); + } + + caseproto_unref (c->proto); + c->proto = caseproto_ref (r->output_proto); + + return c; +} + +static bool +comb_resizer_translate_destroy (void *r_) +{ + struct comb_resizer *r = r_; + + comb_resizer_destroy (r); + return true; +} + +static struct casereader * +comb_resize (struct casereader *subreader, struct comb_resizer *r) +{ + static struct casereader_translator_class class = { + .translate = comb_resize_translate, + .destroy = comb_resizer_translate_destroy, + }; + + return casereader_translate_stateless (subreader, r->output_proto, + &class, r); +} diff --git a/tests/language/commands/match-files.at b/tests/language/commands/match-files.at index 7e7aee352b..3341ce79f1 100644 --- a/tests/language/commands/match-files.at +++ b/tests/language/commands/match-files.at @@ -215,11 +215,7 @@ x,y,j ]) AT_CLEANUP -dnl Tests for a bug that caused MATCH FILES to crash -dnl with incompatible variables, especially but not -dnl exclusively when one variable came from the active -dnl file. -AT_SETUP([MATCH FILES with incompatible variable types]) +AT_SETUP([MATCH FILES with different string widths]) AT_DATA([match-files.sps], [dnl DATA LIST LIST NOTABLE/name (A6) x. BEGIN DATA. @@ -238,18 +234,13 @@ END DATA. MATCH FILES/FILE='x.sav'/FILE=*/BY name. LIST. ]) -AT_CHECK([pspp -O format=csv match-files.sps], [1], [dnl -match-files.sps:15: error: MATCH FILES: Variable name has different type or width in different files. - -"match-files.sps:15.13-15.24: note: MATCH FILES: In file `x.sav', name is a string with width 6. - 15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name. - | ^~~~~~~~~~~~" - -"match-files.sps:15.26-15.31: note: MATCH FILES: In file *, name is a string with width 7. - 15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name. - | ^~~~~~" - -match-files.sps:16: error: Stopping syntax file processing here to avoid a cascade of dependent command failures. +AT_CHECK([pspp -O format=csv match-files.sps], [0], [dnl +Table: Data List +name,x,y +al,7.00,1.00 +brad,8.00,. @&t@ +carl,9.00,2.00 +dan,. ,3.00 ]) AT_CLEANUP @@ -277,7 +268,7 @@ al,1 carl,2 dan,3 END DATA. -MATCH FILES/FILE='x.sav'/FILE=*/BY name. +MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x. MATCH FILES/FILE='x.sav'/IN=**. MATCH FILES/FILE='x.sav'/IN=x/IN=y. MATCH FILES/FILE='x.sav'/BY=x/BY=y. @@ -299,9 +290,10 @@ MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/LAST=x. MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/IN=x. MATCH FILES/KEEP=**. -* The different 'name' variables are a problem, but not if we drop them. -MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x). -MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x)/DROP=name. +* It's a problem to have different types and the same name, +but not if we drop them. +MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x). +MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/DROP=x. ]) AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl "match-files.sps:1.18: error: MATCH FILES: Cannot specify the active dataset since none has been defined. @@ -312,14 +304,14 @@ AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl 12 | MATCH FILES/FILE=*. | ^" -match-files.sps:20: error: MATCH FILES: Variable name has different type or width in different files. +match-files.sps:20: error: MATCH FILES: Variable x has different types in different files. -"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', name is a string with width 6. - 20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name. +"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', x is numeric. + 20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x. | ^~~~~~~~~~~~" -"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, name is a string with width 7. - 20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name. +"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, x is a string. + 20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x. | ^~~~~~" "match-files.sps:21.29-21.30: error: MATCH FILES: Syntax error expecting identifier. @@ -402,14 +394,14 @@ match-files.sps:20: error: MATCH FILES: Variable name has different type or widt 40 | MATCH FILES/KEEP=**. | ^~~~" -match-files.sps:43: error: MATCH FILES: Variable name has different type or width in different files. +match-files.sps:44: error: MATCH FILES: Variable x has different types in different files. -"match-files.sps:43.13-43.24: note: MATCH FILES: In file `x.sav', name is a string with width 6. - 43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x). +"match-files.sps:44.13-44.24: note: MATCH FILES: In file `x.sav', x is numeric. + 44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x). | ^~~~~~~~~~~~" -"match-files.sps:43.26-43.31: note: MATCH FILES: In file *, name is a string with width 7. - 43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x). +"match-files.sps:44.26-44.31: note: MATCH FILES: In file *, x is a string. + 44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x). | ^~~~~~" ]) AT_CLEANUP