From e24089bbe924045c7b98e4bbcc0dc1c4b8703429 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 17 Jun 2011 22:21:38 -0700 Subject: [PATCH] sys-file-writer: Write identical sets of value labels only once. The .sav file format can save space by writing a single set of value labels once and assigning it to multiple variables. PSPP has never taken advantage of this, instead writing the value labels for each variable separately. This commit adopts the strategy for avoiding duplicates apparently used by SPSS, based on observing a .sav file that had many groups of variables with the same labels. Thanks to Curt Reinhold for providing the .sav file that led to this discovery. --- src/data/sys-file-writer.c | 145 ++++++++++++++++++++++++++----------- src/data/value-labels.c | 63 +++++++++++++--- src/data/value-labels.h | 4 + 3 files changed, 161 insertions(+), 51 deletions(-) diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index 63e14ffb..9586cbc2 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -98,8 +98,8 @@ static const struct casewriter_class sys_file_casewriter_class; static void write_header (struct sfm_writer *, const struct dictionary *); static void write_variable (struct sfm_writer *, const struct variable *); -static void write_value_labels (struct sfm_writer *, struct variable *, - int idx); +static void write_value_labels (struct sfm_writer *, + const struct dictionary *); static void write_integer_info_record (struct sfm_writer *, const struct dictionary *); static void write_float_info_record (struct sfm_writer *); @@ -178,7 +178,6 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, { struct sfm_writer *w; mode_t mode; - int idx; int i; /* Check version. */ @@ -236,15 +235,7 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, for (i = 0; i < dict_get_var_cnt (d); i++) write_variable (w, dict_get_var (d, i)); - /* Write out value labels. 
*/ - idx = 0; - for (i = 0; i < dict_get_var_cnt (d); i++) - { - struct variable *v = dict_get_var (d, i); - - write_value_labels (w, v, idx); - idx += sfm_width_to_octs (var_get_width (v)); - } + write_value_labels (w, d); if (dict_get_document_line_cnt (d) > 0) write_documents (w, d); @@ -515,47 +506,117 @@ write_variable (struct sfm_writer *w, const struct variable *v) mv_destroy (&mv); } -/* Writes the value labels for variable V having system file - variable index IDX to system file W. +/* Writes the value labels to system file W. Value labels for long string variables are written separately, by write_long_string_value_labels. */ static void -write_value_labels (struct sfm_writer *w, struct variable *v, int idx) +write_value_labels (struct sfm_writer *w, const struct dictionary *d) { - const struct val_labs *val_labs; - const struct val_lab **labels; - size_t n_labels; + struct label_set + { + struct hmap_node hmap_node; + const struct val_labs *val_labs; + int *indexes; + size_t n_indexes, allocated_indexes; + }; + + size_t n_sets, allocated_sets; + struct label_set **sets; + struct hmap same_sets; size_t i; + int idx; - val_labs = var_get_value_labels (v); - n_labels = val_labs_count (val_labs); - if (n_labels == 0 || var_get_width (v) > 8) - return; + n_sets = allocated_sets = 0; + sets = NULL; + hmap_init (&same_sets); - /* Value label record. */ - write_int (w, 3); /* Record type. 
*/ - write_int (w, val_labs_count (val_labs)); - labels = val_labs_sorted (val_labs); - for (i = 0; i < n_labels; i++) + idx = 0; + for (i = 0; i < dict_get_var_cnt (d); i++) { - const struct val_lab *vl = labels[i]; - char *label = recode_string (var_get_encoding (v), UTF8, - val_lab_get_escaped_label (vl), -1); - uint8_t len = MIN (strlen (label), 255); - - write_value (w, val_lab_get_value (vl), var_get_width (v)); - write_bytes (w, &len, 1); - write_bytes (w, label, len); - write_zeros (w, REM_RND_UP (len + 1, 8)); - free (label); + struct variable *v = dict_get_var (d, i); + + if (var_has_value_labels (v) && var_get_width (v) <= 8) + { + const struct val_labs *val_labs = var_get_value_labels (v); + unsigned int hash = val_labs_hash (val_labs, 0); + struct label_set *set; + + HMAP_FOR_EACH_WITH_HASH (set, struct label_set, hmap_node, + hash, &same_sets) + { + if (val_labs_equal (set->val_labs, val_labs)) + { + if (set->n_indexes >= set->allocated_indexes) + set->indexes = x2nrealloc (set->indexes, + &set->allocated_indexes, + sizeof *set->indexes); + set->indexes[set->n_indexes++] = idx; + goto next_var; + } + } + + set = xmalloc (sizeof *set); + set->val_labs = val_labs; + set->indexes = xmalloc (sizeof *set->indexes); + set->indexes[0] = idx; + set->n_indexes = 1; + set->allocated_indexes = 1; + hmap_insert (&same_sets, &set->hmap_node, hash); + + if (n_sets >= allocated_sets) + sets = x2nrealloc (sets, &allocated_sets, sizeof *sets); + sets[n_sets++] = set; + } + + next_var: + idx += sfm_width_to_octs (var_get_width (v)); + } + + for (i = 0; i < n_sets; i++) + { + const struct label_set *set = sets[i]; + const struct val_labs *val_labs = set->val_labs; + size_t n_labels = val_labs_count (val_labs); + int width = val_labs_get_width (val_labs); + const struct val_lab **labels; + size_t j; + + /* Value label record. */ + write_int (w, 3); /* Record type. 
*/ + write_int (w, n_labels); + labels = val_labs_sorted (val_labs); + for (j = 0; j < n_labels; j++) + { + const struct val_lab *vl = labels[j]; + char *label = recode_string (dict_get_encoding (d), UTF8, + val_lab_get_escaped_label (vl), -1); + uint8_t len = MIN (strlen (label), 255); + + write_value (w, val_lab_get_value (vl), width); + write_bytes (w, &len, 1); + write_bytes (w, label, len); + write_zeros (w, REM_RND_UP (len + 1, 8)); + free (label); + } + free (labels); + + /* Value label variable record. */ + write_int (w, 4); /* Record type. */ + write_int (w, set->n_indexes); + for (j = 0; j < set->n_indexes; j++) + write_int (w, set->indexes[j] + 1); } - free (labels); - /* Value label variable record. */ - write_int (w, 4); /* Record type. */ - write_int (w, 1); /* Number of variables. */ - write_int (w, idx + 1); /* Variable's dictionary index. */ + for (i = 0; i < n_sets; i++) + { + struct label_set *set = sets[i]; + + free (set->indexes); + free (set); + } + free (sets); + hmap_destroy (&same_sets); } /* Writes record type 6, document record. */ diff --git a/src/data/value-labels.c b/src/data/value-labels.c index 6d7f19c1..746e2090 100644 --- a/src/data/value-labels.c +++ b/src/data/value-labels.c @@ -231,21 +231,30 @@ val_labs_find (const struct val_labs *vls, const union value *value) return label ? label->label : NULL; } +/* Searches VLS, which must not be null, for a value label for VALUE, using + HASH as the hash of VALUE. If successful, returns the value label; otherwise, + returns a null pointer. */ +static struct val_lab * +val_labs_lookup__ (const struct val_labs *vls, const union value *value, + unsigned int hash) +{ + struct val_lab *label; + + HMAP_FOR_EACH_WITH_HASH (label, struct val_lab, node, hash, &vls->labels) + if (value_equal (&label->value, value, vls->width)) + return label; + + return NULL; +} + /* Searches VLS for a value label for VALUE. If successful, returns the value label; otherwise, returns a null pointer. 
Returns a null pointer if VLS is null. */ struct val_lab * val_labs_lookup (const struct val_labs *vls, const union value *value) { - if (vls != NULL) - { - struct val_lab *label; - HMAP_FOR_EACH_WITH_HASH (label, struct val_lab, node, - value_hash (value, vls->width, 0), &vls->labels) - if (value_equal (&label->value, value, vls->width)) - return label; - } - return NULL; + return (vls == NULL ? NULL + : val_labs_lookup__ (vls, value, value_hash (value, vls->width, 0))); } /* Returns the first value label in VLS, in arbitrary order, or a @@ -301,3 +310,39 @@ val_labs_sorted (const struct val_labs *vls) else return NULL; } + +/* Returns a hash value that represents all of the labels in VLS, starting from + BASIS. Labels are combined with XOR, so visiting order does not matter. */ +unsigned int +val_labs_hash (const struct val_labs *vls, unsigned int basis) +{ + const struct val_lab *label; + unsigned int hash; + + hash = hash_int (val_labs_count (vls), basis); + HMAP_FOR_EACH (label, struct val_lab, node, &vls->labels) + hash ^= value_hash (&label->value, vls->width, + hash_string (label->label, basis)); + return hash; +} + +/* Returns true if A and B have equal widths and the same values and labels, + else false. Labels are compared by pointer (assumes interning; confirm). */ +bool +val_labs_equal (const struct val_labs *a, const struct val_labs *b) +{ + const struct val_lab *label; + + if (val_labs_count (a) != val_labs_count (b) || a->width != b->width) + return false; + + HMAP_FOR_EACH (label, struct val_lab, node, &a->labels) + { + struct val_lab *label2 = val_labs_lookup__ (b, &label->value, + label->node.hash); + if (!label2 || label->label != label2->label) + return false; + } + + return true; +} diff --git a/src/data/value-labels.h b/src/data/value-labels.h index 6c13ec9a..f0f7ce01 100644 --- a/src/data/value-labels.h +++ b/src/data/value-labels.h @@ -111,4 +111,8 @@ const struct val_lab *val_labs_next (const struct val_labs *, const struct val_lab *); const struct val_lab **val_labs_sorted (const struct val_labs *); +/* Properties of entire sets. 
*/ +unsigned int val_labs_hash (const struct val_labs *, unsigned int basis); +bool val_labs_equal (const struct val_labs *, const struct val_labs *); + #endif /* data/value-labels.h */ -- 2.30.2