From 1dcdba517fbb5c99c37e0a03275325f256975c7c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 4 Mar 2023 19:06:12 -0800 Subject: [PATCH 1/1] dictionary: Always compact immediately upon deletion of a variable. With this change, the dictionary has a new invariant that the case_indexes are always a permutation of [0,N) where N is the number of variables in the dictionary. Most commonly, the case_indexes are the same as the dict_indexes, but reordering the dictionary can permute them. --- src/data/case-map.c | 43 +++----------------------- src/data/case-map.h | 4 --- src/data/dataset-writer.c | 10 ++---- src/data/dataset.c | 29 ++++++----------- src/data/datasheet.c | 6 ++++ src/data/dictionary.c | 42 ++++++------------------- src/data/dictionary.h | 5 +-- src/data/pc+-file-reader.c | 28 +++++++---------- src/data/sys-file-reader.c | 1 - src/language/commands/combine-files.c | 1 - src/language/commands/get.c | 1 - src/language/commands/save-translate.c | 1 - src/language/commands/save.c | 1 - src/ui/gui/marshaller-list | 1 + src/ui/gui/psppire-data-store.c | 20 ++++++------ src/ui/gui/psppire-data-store.h | 2 +- src/ui/gui/psppire-data-window.c | 2 +- src/ui/gui/psppire-dict.c | 19 +++++------- utilities/pspp-convert.c | 6 ++++ 19 files changed, 72 insertions(+), 150 deletions(-) diff --git a/src/data/case-map.c b/src/data/case-map.c index b340972c47..528643b304 100644 --- a/src/data/case-map.c +++ b/src/data/case-map.c @@ -101,8 +101,8 @@ case_map_execute (const struct case_map *map, struct ccase *src) for (dst_idx = 0; dst_idx < n_values; dst_idx++) { int src_idx = map->map[dst_idx]; - if (src_idx != -1) - value_copy (case_data_rw_idx (dst, dst_idx), case_data_idx (src, src_idx), caseproto_get_width (map->proto, dst_idx)); + assert (src_idx != -1); + value_copy (case_data_rw_idx (dst, dst_idx), case_data_idx (src, src_idx), caseproto_get_width (map->proto, dst_idx)); } case_unref (src); return dst; @@ -131,6 +131,8 @@ struct casereader * case_map_create_input_translator (struct case_map *map, struct casereader *subreader) { + if (!map) + return casereader_rename (subreader); static const struct casereader_translator_class class = { translate_case, destroy_case_map, }; @@ -175,43 +177,6 @@ destroy_case_map (void *map_) case_map_destroy (map); return true; } - -/* Creates and returns a case_map that can be used to compact - cases for dictionary D. - - Compacting a case eliminates "holes" between values and after - the last value. (Holes are created by deleting variables.) - - All variables are compacted if EXCLUDE_CLASSES is 0, or it may - contain one or more of DC_ORDINARY, DC_SYSTEM, or DC_SCRATCH - to cause the corresponding type of variable to be deleted - during compaction. */ -struct case_map * -case_map_to_compact_dict (const struct dictionary *d, - unsigned int exclude_classes) -{ - size_t n_vars = dict_get_n_vars (d); - struct caseproto *proto; - struct case_map *map; - size_t n_values; - size_t i; - - /* Create the case mapping. */ - proto = dict_get_compacted_proto (d, exclude_classes); - map = create_case_map (proto); - caseproto_unref (proto); - - /* Add the values to the case mapping. */ - n_values = 0; - for (i = 0; i < n_vars; i++) - { - struct variable *v = dict_get_var (d, i); - if (!(exclude_classes & var_get_dict_class (v))) - insert_mapping (map, var_get_case_index (v), n_values++); - } - - return map; -} struct stage_var { diff --git a/src/data/case-map.h b/src/data/case-map.h index 3e1f8be41f..a74921c38b 100644 --- a/src/data/case-map.h +++ b/src/data/case-map.h @@ -48,10 +48,6 @@ struct case_map_stage *case_map_stage_create (const struct dictionary *); void case_map_stage_destroy (struct case_map_stage *); struct case_map *case_map_stage_get_case_map (const struct case_map_stage *); -/* For eliminating "holes" in a case. */ -struct case_map *case_map_to_compact_dict (const struct dictionary *d, - unsigned int exclude_classes); - /* For mapping cases for one dictionary to another based on variable names within the dictionary. */ struct case_map *case_map_by_name (const struct dictionary *old, diff --git a/src/data/dataset-writer.c b/src/data/dataset-writer.c index 011deed39d..cfaa88ab5c 100644 --- a/src/data/dataset-writer.c +++ b/src/data/dataset-writer.c @@ -73,14 +73,10 @@ dataset_writer_open (struct file_handle *fh, writer->ds = fh_get_dataset (fh); writer->dict = dict_clone (dictionary); + struct case_map_stage *stage = case_map_stage_create (writer->dict); dict_delete_scratch_vars (writer->dict); - if (dict_count_values (writer->dict, 0) < dict_get_n_vars (writer->dict)) - { - writer->compactor = case_map_to_compact_dict (writer->dict, 0); - dict_compact_values (writer->dict); - } - else - writer->compactor = NULL; + writer->compactor = case_map_stage_get_case_map (stage); + case_map_stage_destroy (stage); writer->subwriter = autopaging_writer_create (dict_get_proto (writer->dict)); casewriter = casewriter_create (dict_get_proto (writer->dict), diff --git a/src/data/dataset.c b/src/data/dataset.c index 063a4a4251..c74715ae9b 100644 --- a/src/data/dataset.c +++ b/src/data/dataset.c @@ -360,10 +360,11 @@ dataset_delete_vars (struct dataset *ds, struct variable **vars, size_t n) caseinit_clear (ds->caseinit); caseinit_mark_as_preinited (ds->caseinit, ds->dict); + struct case_map_stage *stage = case_map_stage_create (ds->dict); dict_delete_vars (ds->dict, vars, n); ds->source = case_map_create_input_translator ( - case_map_to_compact_dict (ds->dict, 0), ds->source); - dict_compact_values (ds->dict); + case_map_stage_get_case_map (stage), ds->source); + case_map_stage_destroy (stage); caseinit_clear (ds->caseinit); caseinit_mark_as_preinited (ds->caseinit, ds->dict); } @@ -472,22 +473,13 @@ proc_open_filtering (struct dataset *ds, bool filter) /* Prepare sink. */ if (!ds->discard_output) { - struct dictionary *pd = ds->permanent_dict; - size_t compacted_n_values = dict_count_values (pd, DC_SCRATCH); - assert (dict_count_values (pd, 0) == dict_get_n_vars (pd)); - if (compacted_n_values < dict_get_n_vars (pd)) - { - struct caseproto *compacted_proto; - compacted_proto = dict_get_compacted_proto (pd, DC_SCRATCH); - ds->compactor = case_map_to_compact_dict (pd, DC_SCRATCH); - ds->sink = autopaging_writer_create (compacted_proto); - caseproto_unref (compacted_proto); - } - else - { - ds->compactor = NULL; - ds->sink = autopaging_writer_create (dict_get_proto (pd)); - } + struct dictionary *pd = dict_clone (ds->permanent_dict); + struct case_map_stage *stage = case_map_stage_create (pd); + dict_delete_scratch_vars (pd); + ds->compactor = case_map_stage_get_case_map (stage); + case_map_stage_destroy (stage); + ds->sink = autopaging_writer_create (dict_get_proto (pd)); + dict_unref (pd); } else { @@ -652,7 +644,6 @@ proc_commit (struct dataset *ds) ds->compactor = NULL; dict_delete_scratch_vars (ds->dict); - dict_compact_values (ds->dict); } /* Old data sink becomes new data source. */ diff --git a/src/data/datasheet.c b/src/data/datasheet.c index 1659f6f71e..970a7bbac2 100644 --- a/src/data/datasheet.c +++ b/src/data/datasheet.c @@ -190,6 +190,7 @@ caseproto_to_n_bytes (const struct caseproto *proto) for (i = 0; i < caseproto_get_n_widths (proto); i++) { int width = caseproto_get_width (proto, i); + assert (width >= 0); if (width >= 0) n_bytes += width_to_n_bytes (width); } @@ -236,6 +237,7 @@ datasheet_create (struct casereader *reader) int width = caseproto_get_width (ds->proto, i); column->source = ds->sources[0]; column->width = width; + assert (width >= 0); if (width >= 0) { column->value_ofs = i; @@ -473,6 +475,8 @@ datasheet_resize_column (struct datasheet *ds, size_t column, int new_width, col = &ds->columns[column]; old_col = *col; old_width = old_col.width; + assert (old_width >= 0); + assert (new_width >= 0); if (new_width == -1) { @@ -756,6 +760,7 @@ allocate_column (struct datasheet *ds, int width, struct column *column) column->value_ofs = -1; column->width = width; + assert (width >= 0); if (width >= 0) { int n_bytes; @@ -1402,6 +1407,7 @@ copy_case_into_source (struct source *source, struct ccase *c, casenumber row) for (i = 0; i < n_widths; i++) { int width = caseproto_get_width (proto, i); + assert (width >= 0); if (width >= 0) { int n_bytes = width_to_n_bytes (width); diff --git a/src/data/dictionary.c b/src/data/dictionary.c index 71c6e8136e..2c4633f7f9 100644 --- a/src/data/dictionary.c +++ b/src/data/dictionary.c @@ -478,7 +478,6 @@ static void dict_delete_var__ (struct dictionary *d, struct variable *v, bool skip_callbacks) { int dict_index = var_get_dict_index (v); - const int case_index = var_get_case_index (v); assert (dict_contains_var (d, v)); @@ -508,8 +507,8 @@ dict_delete_var__ (struct dictionary *d, struct variable *v, bool skip_callbacks if (! skip_callbacks) { if (d->changed) d->changed (d, d->changed_data); - if (d->callbacks && d->callbacks->var_deleted) - d->callbacks->var_deleted (d, v, dict_index, case_index, d->cb_data); + if (d->callbacks && d->callbacks->vars_deleted) + d->callbacks->vars_deleted (d, dict_index, 1, d->cb_data); } invalidate_proto (d); @@ -533,6 +532,7 @@ void dict_delete_var (struct dictionary *d, struct variable *v) { dict_delete_var__ (d, v, false); + dict_compact_values (d); } @@ -548,6 +548,7 @@ dict_delete_vars (struct dictionary *d, while (count-- > 0) dict_delete_var (d, *vars++); + dict_compact_values (d); } /* Deletes the COUNT variables in D starting at index IDX. This @@ -569,7 +570,6 @@ dict_delete_consecutive_vars (struct dictionary *d, size_t idx, size_t count) struct delvar { struct ll ll; struct variable *var; - int case_index; }; struct ll_list list = LL_INITIALIZER (list); @@ -590,7 +590,6 @@ dict_delete_consecutive_vars (struct dictionary *d, size_t idx, size_t count) dict_set_filter (d, NULL); dv->var = v; - dv->case_index = var_get_case_index (v); ll_push_tail (&list, (struct ll *)dv); } @@ -611,15 +610,17 @@ dict_delete_consecutive_vars (struct dictionary *d, size_t idx, size_t count) the variables. The vardict is not valid at this point anymore. That is the reason why we stored the caseindex before reindexing. */ + if (d->callbacks && d->callbacks->vars_deleted) + d->callbacks->vars_deleted (d, idx, count, d->cb_data); for (size_t vi = idx; vi < idx + count; vi++) { struct delvar *dv = (struct delvar *) ll_pop_head (&list); var_clear_vardict (dv->var); - if (d->callbacks && d->callbacks->var_deleted) - d->callbacks->var_deleted (d, dv->var, vi, dv->case_index, d->cb_data); var_unref (dv->var); free (dv); } + + dict_compact_values (d); } /* Deletes scratch variables from dictionary D. */ @@ -635,6 +636,8 @@ dict_delete_scratch_vars (struct dictionary *d) dict_delete_var (d, d->vars[i].var); else i++; + + dict_compact_values (d); } @@ -1437,31 +1440,6 @@ dict_count_values (const struct dictionary *d, unsigned int exclude_classes) return n; } -/* Returns the case prototype that would result after deleting - all variables from D that are not in one of the - EXCLUDE_CLASSES and compacting the dictionary with - dict_compact(). - - The caller must unref the returned caseproto when it is no - longer needed. */ -struct caseproto * -dict_get_compacted_proto (const struct dictionary *d, - unsigned int exclude_classes) -{ - struct caseproto *proto; - size_t i; - - assert (!(exclude_classes & ~DC_ALL)); - - proto = caseproto_create (); - for (i = 0; i < d->n_vars; i++) - { - struct variable *v = d->vars[i].var; - if (!(exclude_classes & var_get_dict_class (v))) - proto = caseproto_add_width (proto, var_get_width (v)); - } - return proto; -} /* Returns the file label for D, or a null pointer if D is unlabeled (see cmd_file_label()). */ const char * diff --git a/src/data/dictionary.h b/src/data/dictionary.h index cec706b355..d6823ed854 100644 --- a/src/data/dictionary.h +++ b/src/data/dictionary.h @@ -120,8 +120,6 @@ const struct caseproto *dict_get_proto (const struct dictionary *); size_t dict_count_values (const struct dictionary *, unsigned int exclude_classes); void dict_compact_values (struct dictionary *); -struct caseproto *dict_get_compacted_proto (const struct dictionary *, - unsigned int exclude_classes); /* SPLIT FILE variables. @@ -205,8 +203,7 @@ bool dict_id_is_valid (const struct dictionary *, const char *id); struct dict_callbacks { void (*var_added) (struct dictionary *, int, void *); - void (*var_deleted) (struct dictionary *, const struct variable *, - int dict_index, int case_index, void *); + void (*vars_deleted) (struct dictionary *, int dict_index, unsigned int n, void *); void (*var_changed) (struct dictionary *, int, unsigned int, const struct variable *, void *); void (*weight_changed) (struct dictionary *, int, void *); void (*filter_changed) (struct dictionary *, int, void *); diff --git a/src/data/pc+-file-reader.c b/src/data/pc+-file-reader.c index e12536b577..73b9ea804b 100644 --- a/src/data/pc+-file-reader.c +++ b/src/data/pc+-file-reader.c @@ -80,6 +80,9 @@ struct pcp_var_record { unsigned int pos; + bool drop; + union value tmp; + char name[9]; int width; struct fmt_spec format; @@ -393,14 +396,6 @@ pcp_get_strings (const struct any_reader *r_, struct pool *pool, return aux.n; } -static void -find_and_delete_var (struct dictionary *dict, const char *name) -{ - struct variable *var = dict_lookup_var (dict, name); - if (var) - dict_delete_var (dict, var); -} - /* Decodes the dictionary read from R, saving it into *DICT. Character strings in R are decoded using ENCODING, or an encoding obtained from R if ENCODING is null, or the locale encoding if R specifies no encoding. @@ -442,10 +437,6 @@ pcp_decode (struct any_reader *r_, const char *encoding, dictionary and may destroy or modify its variables. */ r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); - find_and_delete_var (dict, "CASENUM_"); - find_and_delete_var (dict, "DATE_"); - find_and_delete_var (dict, "WEIGHT_"); - *dictp = dict; if (infop) { @@ -844,9 +835,13 @@ parse_variable_records (struct pcp_reader *r, struct dictionary *dict, rec->name, -1, r->pool); name[strcspn (name, " ")] = '\0'; - /* Transform $DATE => DATE_, $WEIGHT => WEIGHT_, $CASENUM => CASENUM_. */ - if (name[0] == '$') - name = pool_asprintf (r->pool, "%s_", name + 1); + /* Drop system variables. */ + rec->drop = name[0] == '$'; + if (rec->drop) + { + value_init_pool (r->pool, &rec->tmp, rec->width); + continue; + } if (!dict_id_is_valid (dict, name) || name[0] == '#') { @@ -954,10 +949,11 @@ pcp_file_casereader_read (struct casereader *reader, void *r_) r->n_cases--; c = case_create (r->proto); + size_t case_idx = 0; for (i = 0; i < r->n_vars; i++) { struct pcp_var_record *var = &r->vars[i]; - union value *v = case_data_rw_idx (c, i); + union value *v = var->drop ? &var->tmp : case_data_rw_idx (c, case_idx++); if (var->width == 0) retval = read_case_number (r, &v->f); diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 880b62e384..491df7f996 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -2141,7 +2141,6 @@ parse_long_string_map (struct sfm_reader *r, var_set_width (var, length); } close_text_record (r, text); - dict_compact_values (dict); return true; } diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c index c6af498b86..6743c5e7a9 100644 --- a/src/language/commands/combine-files.c +++ b/src/language/commands/combine-files.c @@ -407,7 +407,6 @@ combine_files (enum comb_command_type command, goto error; dict_delete_scratch_vars (proc.dict); - dict_compact_values (proc.dict); /* Set up mapping from each file's variables to master variables. */ diff --git a/src/language/commands/get.c b/src/language/commands/get.c index 90ad2e55e6..ea6323ba40 100644 --- a/src/language/commands/get.c +++ b/src/language/commands/get.c @@ -142,7 +142,6 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, if (!parse_dict_trim (lexer, dict)) goto error; } - dict_compact_values (dict); map = case_map_stage_get_case_map (stage); case_map_stage_destroy (stage); diff --git a/src/language/commands/save-translate.c b/src/language/commands/save-translate.c index 25e2020c52..600a2156bd 100644 --- a/src/language/commands/save-translate.c +++ b/src/language/commands/save-translate.c @@ -236,7 +236,6 @@ cmd_save_translate (struct lexer *lexer, struct dataset *ds) } dict_delete_scratch_vars (dict); - dict_compact_values (dict); struct csv_writer_options csv_opts = { .recode_user_missing = recode_user_missing, diff --git a/src/language/commands/save.c b/src/language/commands/save.c index ab63c52547..d892c536ee 100644 --- a/src/language/commands/save.c +++ b/src/language/commands/save.c @@ -310,7 +310,6 @@ parse_write_command (struct lexer *lexer, struct dataset *ds, } dict_delete_scratch_vars (dict); - dict_compact_values (dict); if (handle) { diff --git a/src/ui/gui/marshaller-list b/src/ui/gui/marshaller-list index 202d172572..422c9c72b0 100644 --- a/src/ui/gui/marshaller-list +++ b/src/ui/gui/marshaller-list @@ -4,3 +4,4 @@ BOOLEAN:INT,INT VOID:POINTER,INT,INT VOID:INT,UINT,POINTER VOID:UINT,UINT,UINT +VOID:INT,UINT \ No newline at end of file diff --git a/src/ui/gui/psppire-data-store.c b/src/ui/gui/psppire-data-store.c index 598e278669..0a6614a641 100644 --- a/src/ui/gui/psppire-data-store.c +++ b/src/ui/gui/psppire-data-store.c @@ -352,27 +352,25 @@ psppire_data_store_init (PsppireDataStore *data_store) static void -psppire_data_store_delete_value (PsppireDataStore *store, gint case_index) +psppire_data_store_delete_values (PsppireDataStore *store, gint case_index, + guint n) { g_return_if_fail (store->datasheet); g_return_if_fail (case_index < datasheet_get_n_columns (store->datasheet)); - datasheet_delete_columns (store->datasheet, case_index, 1); - datasheet_insert_column (store->datasheet, NULL, -1, case_index); + datasheet_delete_columns (store->datasheet, case_index, n); } /* - A callback which occurs after a variable has been deleted. + A callback which occurs after variables have been deleted. */ static void -delete_variable_callback (GObject *obj, const struct variable *var UNUSED, - gint dict_index, gint case_index, - gpointer data) +delete_variables_callback (GObject *obj, gint dict_index, unsigned int n, gpointer data) { PsppireDataStore *store = PSPPIRE_DATA_STORE (data); - psppire_data_store_delete_value (store, case_index); + psppire_data_store_delete_values (store, dict_index, n); } struct resize_datum_aux @@ -514,9 +512,9 @@ psppire_data_store_set_dictionary (PsppireDataStore *data_store, PsppireDict *di G_CALLBACK (insert_variable_callback), data_store); - data_store->dict_handler_id [VARIABLE_DELETED] = - g_signal_connect (dict, "variable-deleted", - G_CALLBACK (delete_variable_callback), + data_store->dict_handler_id [VARIABLES_DELETED] = + g_signal_connect (dict, "variables-deleted", + G_CALLBACK (delete_variables_callback), data_store); data_store->dict_handler_id [VARIABLE_CHANGED] = diff --git a/src/ui/gui/psppire-data-store.h b/src/ui/gui/psppire-data-store.h index 2fff2d16dd..87bdefd14e 100644 --- a/src/ui/gui/psppire-data-store.h +++ b/src/ui/gui/psppire-data-store.h @@ -57,7 +57,7 @@ struct dictionary; enum dict_signal_handler { VARIABLE_INSERTED, VARIABLE_CHANGED, - VARIABLE_DELETED, + VARIABLES_DELETED, n_dict_signals }; diff --git a/src/ui/gui/psppire-data-window.c b/src/ui/gui/psppire-data-window.c index 54f65d7747..e29f469b4a 100644 --- a/src/ui/gui/psppire-data-window.c +++ b/src/ui/gui/psppire-data-window.c @@ -1479,7 +1479,7 @@ psppire_data_window_finish_init (PsppireDataWindow *de, G_CALLBACK (enable_save), de); g_signal_connect_swapped (de->dict, "variable-inserted", G_CALLBACK (enable_save), de); - g_signal_connect_swapped (de->dict, "variable-deleted", + g_signal_connect_swapped (de->dict, "variables-deleted", G_CALLBACK (enable_save), de); enable_save (de); diff --git a/src/ui/gui/psppire-dict.c b/src/ui/gui/psppire-dict.c index 22be052f30..0ed6841131 100644 --- a/src/ui/gui/psppire-dict.c +++ b/src/ui/gui/psppire-dict.c @@ -50,7 +50,7 @@ GType role_enum_type; enum { VARIABLE_CHANGED, VARIABLE_INSERTED, - VARIABLE_DELETED, + VARIABLES_DELETED, WEIGHT_CHANGED, FILTER_CHANGED, @@ -191,18 +191,17 @@ psppire_dict_class_init (PsppireDictClass *class) 1, G_TYPE_INT); - signals [VARIABLE_DELETED] = - g_signal_new ("variable-deleted", + signals [VARIABLES_DELETED] = + g_signal_new ("variables-deleted", G_TYPE_FROM_CLASS (class), G_SIGNAL_RUN_FIRST, 0, NULL, NULL, - psppire_marshal_VOID__POINTER_INT_INT, + psppire_marshal_VOID__INT_UINT, G_TYPE_NONE, - 3, - G_TYPE_POINTER, + 2, G_TYPE_INT, - G_TYPE_INT); + G_TYPE_UINT); signals [WEIGHT_CHANGED] = g_signal_new ("weight-changed", @@ -268,11 +267,9 @@ addcb (struct dictionary *d, int idx, void *pd) } static void -delcb (struct dictionary *d, const struct variable *var, - int dict_idx, int case_idx, void *pd) +delcb (struct dictionary *d, int dict_idx, unsigned int n, void *pd) { - g_signal_emit (pd, signals [VARIABLE_DELETED], 0, - var, dict_idx, case_idx); + g_signal_emit (pd, signals [VARIABLES_DELETED], 0, dict_idx, n); g_signal_emit_by_name (pd, "items-changed", dict_idx, 1, 0); } diff --git a/utilities/pspp-convert.c b/utilities/pspp-convert.c index a381a98746..6599ba3fc8 100644 --- a/utilities/pspp-convert.c +++ b/utilities/pspp-convert.c @@ -23,6 +23,7 @@ #include #include "data/any-reader.h" +#include "data/case-map.h" #include "data/casereader.h" #include "data/casewriter.h" #include "data/csv-file-writer.h" @@ -294,6 +295,7 @@ main (int argc, char *argv[]) if (reader == NULL) goto error; + struct case_map_stage *stage = case_map_stage_create (dict); if (keep) { struct variable **keep_vars; @@ -316,6 +318,10 @@ main (int argc, char *argv[]) free (drop_vars); } + reader = case_map_create_input_translator ( + case_map_stage_get_case_map (stage), reader); + case_map_stage_destroy (stage); + if (!strcmp (output_format, "csv") || !strcmp (output_format, "txt")) { if (!csv_opts.delimiter) -- 2.30.2