From: Ben Pfaff Date: Sun, 23 Dec 2018 17:20:34 +0000 (-0800) Subject: categoricals: Improve comments. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=259b5c32c06fd93964720f4a0e7844b5a24c55db;p=pspp categoricals: Improve comments. --- diff --git a/src/math/categoricals.c b/src/math/categoricals.c index d2b5c322b0..db9ff64a05 100644 --- a/src/math/categoricals.c +++ b/src/math/categoricals.c @@ -113,15 +113,23 @@ lookup_variable (const struct hmap *map, const struct variable *var, unsigned in struct interact_params { - /* A map of cases indexed by a interaction_value */ + /* An example of each interaction that appears in the data, like a frequency + table for 'iact'. By construction, the number of elements must be less + than or equal to 'n_cats'. + + categoricals_update() updates 'ivmap' case-by-case, then + categoricals_done() dumps 'ivmap' into 'reverse_interaction_value_map' and + sorts it. */ struct hmap ivmap; + struct interaction_value **reverse_interaction_value_map; const struct interaction *iact; int base_subscript_short; int base_subscript_long; - /* The number of distinct values of this interaction */ + /* Product of hmap_count(&varnodes[*]->valmap), that is, the maximum number + of distinct values of this interaction. */ int n_cats; /* An array of integers df_n * df_{n-1} * df_{n-2} ... @@ -131,9 +139,7 @@ struct interact_params double *enc_sum; - /* A map of interaction_values indexed by subscript */ - struct interaction_value **reverse_interaction_value_map; - + /* Sum of reverse_interaction_value_map[*]->cc. */ double cc; }; @@ -157,27 +163,23 @@ struct categoricals /* An array of interact_params */ struct interact_params *iap; + size_t n_iap; /* Map whose members are the union of the variables which comprise IAP */ struct hmap varmap; - /* The size of IAP. (ie, the number of interactions involved.) */ - size_t n_iap; - /* The number of categorical variables which contain entries. 
In the absence of missing values, this will be equal to N_IAP */ size_t n_vars; - size_t df_sum; - /* A map to enable the lookup of variables indexed by subscript. This map considers only the N - 1 of the N variables. */ - int *reverse_variable_map_short; + int *reverse_variable_map_short; /* 'df_sum' elements. */ + size_t df_sum; /* Like the above, but uses all N variables */ - int *reverse_variable_map_long; - + int *reverse_variable_map_long; /* 'n_cats_total' elements. */ size_t n_cats_total; struct pool *pool; @@ -342,12 +344,21 @@ lookup_case (const struct hmap *map, const struct interaction *iact, const struc return iv; } +/* Returns true iff CAT is sane, that is, if it is complete and has at least + one value. */ bool categoricals_sane (const struct categoricals *cat) { return cat->sane; } +/* Creates and returns a new categoricals object whose variables come from the + N_INTER interactions objects in the array starting at INTER. (The INTER + objects must outlive the categoricals object because it uses them + internally.) + + FCTR_EXCL determines which cases are listwise ignored by + categoricals_update(). */ struct categoricals * categoricals_create (struct interaction *const*inter, size_t n_inter, const struct variable *wv, enum mv_class fctr_excl) @@ -473,7 +484,8 @@ categoricals_update (struct categoricals *cat, const struct ccase *c) } } -/* Return the number of categories (distinct values) for interction N */ +/* Return the number of categories (distinct values) for interaction IDX in + CAT. */ size_t categoricals_n_count (const struct categoricals *cat, size_t n) { @@ -481,6 +493,7 @@ categoricals_n_count (const struct categoricals *cat, size_t n) } +/* Returns the number of degrees of freedom for interaction IDX within CAT. 
*/ size_t categoricals_df (const struct categoricals *cat, size_t n) { @@ -489,7 +502,7 @@ categoricals_df (const struct categoricals *cat, size_t n) } -/* Return the total number of categories */ +/* Return the total number of categories across all interactions in CAT. */ size_t categoricals_n_total (const struct categoricals *cat) { @@ -499,6 +512,7 @@ categoricals_n_total (const struct categoricals *cat) return cat->n_cats_total; } +/* Returns the total degrees of freedom for CAT. */ size_t categoricals_df_total (const struct categoricals *cat) { @@ -508,6 +522,7 @@ categoricals_df_total (const struct categoricals *cat) return cat->df_sum; } +/* Returns true iff categoricals_done() has been called for CAT. */ bool categoricals_is_complete (const struct categoricals *cat) { @@ -515,8 +530,10 @@ categoricals_is_complete (const struct categoricals *cat) } -/* This function must be called *before* any call to categoricals_get_*_by subscript and - *after* all calls to categoricals_update */ +/* This function must be called (once) before any call to the *_by_subscript or + *_by_category functions, but AFTER any calls to categoricals_update. If this + function returns false, then no calls to _by_subscript or *_by_category are + allowed. */ void categoricals_done (const struct categoricals *cat_) { diff --git a/src/math/categoricals.h b/src/math/categoricals.h index 4d7d4959a5..d5a8b0df95 100644 --- a/src/math/categoricals.h +++ b/src/math/categoricals.h @@ -28,91 +28,123 @@ struct interaction; union value ; +/* Categoricals. + + A categorical variable has a finite and usually small number of possible + values. The categoricals data structure organizes an array of interactions + among categorical variables, that is, a set of sets of categorical + variables. (Both levels of "set" are ordered.) + + The life cycle of a categoricals object looks like this: + + 1. Create it with categoricals_create(). This fixes the set of interactions + and other parameters. + + 2. 
Pass all of the desired cases through the object with + categoricals_update(). + + 3. Finalize the object with categoricals_done(). Only at this point may + most of the categoricals query functions be called. + + 4. Use the categoricals object as desired. + + 5. Destroy the object with categoricals_destroy(). +*/ + +/* Creating and destroying categoricals. */ struct categoricals *categoricals_create (struct interaction *const*, size_t n_int, const struct variable *wv, enum mv_class fctr_excl); - void categoricals_destroy (struct categoricals *); +/* Updating categoricals. */ void categoricals_update (struct categoricals *cat, const struct ccase *c); +void categoricals_done (const struct categoricals *cat); +bool categoricals_is_complete (const struct categoricals *cat); + +/* Counting categories. + A variable's number of categories is the number of unique values observed in + the data passed to categoricals_update(). + An interaction's number of categories is the number of observed unique + values of its variables, which will often be less than the product of its + variables' numbers of categories. + + A categorical object's number of categories is the sum of its interactions' + categories. */ /* Return the number of categories (distinct values) for variable N */ size_t categoricals_n_count (const struct categoricals *cat, size_t n); +size_t categoricals_n_total (const struct categoricals *cat); -size_t categoricals_df (const struct categoricals *cat, size_t n); +/* Degrees of freedom. -/* Return the total number of categories */ -size_t categoricals_n_total (const struct categoricals *cat); + A categorical variable with N_CATS categories has N_CATS - 1 degrees of + freedom. -/* Return the total degrees of freedom */ -size_t categoricals_df_total (const struct categoricals *cat); + An interaction's degrees of freedom is the product of its variables' degrees + of freedom. 
+ A categorical object's degrees of freedom is the sum of its interactions' + degrees of freedom. */ +size_t categoricals_df (const struct categoricals *cat, size_t n); +size_t categoricals_df_total (const struct categoricals *cat); -/* - Return the total number of variables which participated in these categoricals. - Due to the possibility of missing values, this is NOT necessarily - equal to the number of variables passed in when the object was - created. -*/ size_t categoricals_get_n_variables (const struct categoricals *cat); -bool categoricals_is_complete (const struct categoricals *cat); - +/* Sanity. */ +bool categoricals_sane (const struct categoricals *cat); -/* - Must be called (once) before any call to the *_by_subscript or *_by_category - functions, but AFTER any calls to categoricals_update. - If this function returns false, then no calls to _by_subscript or *_by_category - are allowed. -*/ -void categoricals_done (const struct categoricals *cat); +/* "Short map". -bool categoricals_sane (const struct categoricals *cat); + These look up an interaction within a categoricals object on the basis of a + "subscript". Interaction 0 with DF_0 degrees of freedom is assigned + subscripts [0, DF_0 - 1], interaction 1 with DF_1 degrees of freedom is + assigned subscripts [DF_0, DF_0 + DF_1 - 1], and so on. The subscripts + passed in must be in the range [0, DF_SUM - 1] where DF_SUM is the total + number of degrees of freedom for the object, as returned by + categoricals_df_total(). + These functions are intended for covariance matrix routines, where normally + 1 less than the total number of distinct values of each categorical variable + should be considered. -/* - The *_by_subscript functions use the short map. - Their intended use is by covariance matrix routines, where normally 1 less than - the total number of distinct values of each categorical variable should - be considered. 
- */ + These functions may be used on an object only after calling + categoricals_done(). +*/ double categoricals_get_weight_by_subscript (const struct categoricals *cat, int subscript); const struct interaction *categoricals_get_interaction_by_subscript (const struct categoricals *cat, int subscript); double categoricals_get_sum_by_subscript (const struct categoricals *cat, int subscript); - -/* Returns unity if the value in case C at SUBSCRIPT is equal to the category - for that subscript */ double categoricals_get_dummy_code_for_case (const struct categoricals *cat, int subscript, const struct ccase *c); - -/* Returns unity if the value in case C at SUBSCRIPT is equal to the category - for that subscript. - Else if it is the last category, return -1. - Otherwise return 0. - */ double categoricals_get_effects_code_for_case (const struct categoricals *cat, int subscript, const struct ccase *c); -/* These use the long map. Useful for descriptive statistics. */ +/* "Long map". + + These look up an interaction within a categoricals object on the basis of a + "category index". Interaction 0 in CAT with CAT_0 categories has indexes + [0, CAT_0 - 1], interaction 1 with CAT_1 categories has indexes [CAT_0, + CAT_0 + CAT_1 - 1], and so on. The indexes passed in must be in the range + [0, CAT_TOTAL - 1] where CAT_TOTAL is the total number of categories for the + object, as returned by categoricals_n_total(). + These functions are useful for descriptive statistics. + These functions may be used on an object only after calling + categoricals_done(). 
+*/ const struct ccase * categoricals_get_case_by_category_real (const struct categoricals *cat, int iact, int n); - void * categoricals_get_user_data_by_category_real (const struct categoricals *cat, int iact, int n); - - void * categoricals_get_user_data_by_category (const struct categoricals *cat, int category); - const struct ccase * categoricals_get_case_by_category (const struct categoricals *cat, int subscript); - struct payload { void* (*create) (const void *aux1, void *aux2); @@ -121,7 +153,6 @@ struct payload void (*destroy) (const void *aux1, void *aux2, void *user_data); }; - void categoricals_set_payload (struct categoricals *cats, const struct payload *p, const void *aux1, void *aux2); bool categoricals_isbalanced (const struct categoricals *cat);